Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 28 Aug 2017 01:54:24 +0200
From:      Daniel Roethlisberger <daniel@roe.ch>
To:        freebsd-hackers@freebsd.org
Subject:   Re: [PATCH] O_NOATIME support for open(2)
Message-ID:  <20170827235424.GA34762@schoggimuss.roe.ch>
In-Reply-To: <20170827141708.GV1700@kib.kiev.ua>
References:  <20170826161827.GA21456@schoggimuss.roe.ch> <20170826175606.GQ1700@kib.kiev.ua> <20170827131806.GB21456@schoggimuss.roe.ch> <20170827141708.GV1700@kib.kiev.ua>

next in thread | previous in thread | raw e-mail | index | archive | help

--envbJBWh7q8WU6mo
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline

Konstantin Belousov <kostikbel@gmail.com> 2017-08-27:
> > > > The attached patch against 11.1 implements O_NOATIME support for
> > > > open(2); it prevents read(2) and mmap(2) from clobbering atime if
> > > > the file descriptor was opened with O_NOATIME.  O_NOATIME is only
> > > > permitted for root and the owner of the file.  Currently it is
> > > > only implemented for ufs/ffs.  It seems to work for me but has
> > > > not been extensively tested.
> > > What would happen when additional page-in occurs on the mmaped area ?
> > 
> > With mmap, the vnode is marked for atime update at the time of
> > calling mmap (unless O_NOATIME is set on the fd).  I do not see
> > how the patch would impact page-ins in any way.  Can you
> > elaborate?
> 
> I mean, do we have some code paths which would cause page-ins to set
> atime ? If we currently do not have that, fine. My brief reading of the
> code suggests that we do not, at least for UFS.

Thanks for clarifying.  I am not aware of any atime-updating
page-in code paths.

> Somewhat related, if an image file is opened O_EXEC | O_NOATIME, does
> calling fexecve(2) on the fd prevents atime update with your patch ?
> It seems to me that the case is not handled.

Correct, thanks.

> Note that in kernel code, we usually prefer O_XXX spelling for the open
> flags over the FXXX.

I removed FNOATIME entirely based on your feedback.

Attached a revised patch that incorporates most of the feedback,
plus supports setting/unsetting O_NOATIME through fcntl and
handles many of the local file systems.  It also enables
O_NOATIME in the Linux compat shim.  Docs are still missing, and
testing on other filesystems than UFS very limited or not at all.

Additional feedback much appreciated, especially on the code in
kern_fcntl and vn_open_vnode.

Daniel

-- 
Daniel Roethlisberger
http://daniel.roe.ch/


--envbJBWh7q8WU6mo
Content-Type: text/x-diff; charset=us-ascii
Content-Disposition: attachment; filename="onoatime-v3.diff"

diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index 17819bc..ea736f1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -820,7 +820,13 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 out:
 	zfs_range_unlock(rl);
 
+#ifdef __FreeBSD__
+	if (ioflag & IO_NOATIME == 0) {
+		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+	}
+#else
 	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+#endif /* __FreeBSD__ */
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
diff --git a/sys/compat/linux/linux_file.c b/sys/compat/linux/linux_file.c
index 09133eb..2bf40c4 100644
--- a/sys/compat/linux/linux_file.c
+++ b/sys/compat/linux/linux_file.c
@@ -130,7 +130,8 @@ linux_common_open(struct thread *td, int dirfd, char *path, int l_flags, int mod
 		bsd_flags |= O_NOFOLLOW;
 	if (l_flags & LINUX_O_DIRECTORY)
 		bsd_flags |= O_DIRECTORY;
-	/* XXX LINUX_O_NOATIME: unable to be easily implemented. */
+	if (l_flags & LINUX_O_NOATIME)
+		bsd_flags |= O_NOATIME;
 
 	error = kern_openat(td, dirfd, path, UIO_SYSSPACE, bsd_flags, mode);
 	if (error != 0)
@@ -1327,6 +1328,8 @@ fcntl_common(struct thread *td, struct linux_fcntl_args *args)
 		if (result & O_DIRECT)
 			td->td_retval[0] |= LINUX_O_DIRECT;
 #endif
+		if (result & O_NOATIME)
+			td->td_retval[0] |= LINUX_O_NOATIME;
 		return (error);
 
 	case LINUX_F_SETFL:
@@ -1347,6 +1350,8 @@ fcntl_common(struct thread *td, struct linux_fcntl_args *args)
 		if (args->arg & LINUX_O_DIRECT)
 			arg |= O_DIRECT;
 #endif
+		if (args->arg & LINUX_O_NOATIME)
+			arg |= O_NOATIME;
 		return (kern_fcntl(td, args->fd, F_SETFL, arg));
 
 	case LINUX_F_GETLK:
diff --git a/sys/fs/ext2fs/ext2_vnops.c b/sys/fs/ext2fs/ext2_vnops.c
index da3b267..a9d4702 100644
--- a/sys/fs/ext2fs/ext2_vnops.c
+++ b/sys/fs/ext2fs/ext2_vnops.c
@@ -1765,6 +1765,7 @@ ext2_ind_read(struct vop_read_args *ap)
 	}
 
 	if ((error == 0 || uio->uio_resid != orig_resid) &&
+	    ((ioflag & IO_NOATIME) == 0) &&
 	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
 		ip->i_flag |= IN_ACCESS;
 	return (error);
diff --git a/sys/fs/msdosfs/msdosfs_vnops.c b/sys/fs/msdosfs/msdosfs_vnops.c
index cfabfaa..03b0ab2 100644
--- a/sys/fs/msdosfs/msdosfs_vnops.c
+++ b/sys/fs/msdosfs/msdosfs_vnops.c
@@ -600,6 +600,7 @@ msdosfs_read(struct vop_read_args *ap)
 		brelse(bp);
 	} while (error == 0 && uio->uio_resid > 0 && n != 0);
 	if (!isadir && (error == 0 || uio->uio_resid != orig_resid) &&
+	    ((ap->a_ioflag & IO_NOATIME) == 0) &&
 	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
 		dep->de_flag |= DE_ACCESS;
 	return (error);
diff --git a/sys/fs/tmpfs/tmpfs_vnops.c b/sys/fs/tmpfs/tmpfs_vnops.c
index 21baeb7..543d7fc 100644
--- a/sys/fs/tmpfs/tmpfs_vnops.c
+++ b/sys/fs/tmpfs/tmpfs_vnops.c
@@ -476,7 +476,8 @@ tmpfs_read(struct vop_read_args *v)
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	node = VP_TO_TMPFS_NODE(vp);
-	tmpfs_set_status(node, TMPFS_NODE_ACCESSED);
+	if ((v->a_ioflag & IO_NOATIME) == 0)
+		tmpfs_set_status(node, TMPFS_NODE_ACCESSED);
 	return (uiomove_object(node->tn_reg.tn_aobj, node->tn_size, uio));
 }
 
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index bea81a5..0f97601 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -553,6 +553,27 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 		    cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp);
 		if (error != 0)
 			break;
+		if ((arg & O_NOATIME) && ((fp->f_flag & O_NOATIME) == 0)) {
+			if (fp->f_type != DTYPE_VNODE) {
+				error = ENOTTY;
+				fdrop(fp, td);
+				break;
+			}
+			if ((fp->f_flag & FREAD) == 0) {
+				error = EOPNOTSUPP;
+				fdrop(fp, td);
+				break;
+			}
+			vp = fp->f_vnode;
+			vrefact(vp);
+			vn_lock(vp, LK_SHARED | LK_RETRY);
+			error = VOP_ACCESS(vp, VADMIN, td->td_ucred, td);
+			vput(vp);
+			if (error != 0) {
+				fdrop(fp, td);
+				break;
+			}
+		}
 		do {
 			tmp = flg = fp->f_flag;
 			tmp &= ~FCNTLFLAGS;
@@ -2820,10 +2841,23 @@ fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp
 }
 
 int
-fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
+fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp,
+    struct file **fpp)
 {
+	int error;
 
-	return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
+	*vpp = NULL;
+	error = _fget(td, fd, fpp, FEXEC, rightsp, NULL);
+	if (error != 0)
+		return (error);
+	if ((*fpp)->f_vnode == NULL) {
+		error = EINVAL;
+	} else {
+		*vpp = (*fpp)->f_vnode;
+		vrefact(*vpp);
+	}
+
+	return (error);
 }
 
 #ifdef notyet
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index 1a3cc42..0fcfeee 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -381,6 +381,8 @@ do_execve(td, args, mac_p)
 #ifdef HWPMC_HOOKS
 	struct pmckern_procexec pe;
 #endif
+	struct file *fp;
+	boolean_t noatime = FALSE;
 	static const char fexecv_proc_title[] = "(fexecv)";
 
 	imgp = &image_params;
@@ -453,9 +455,11 @@ interpret:
 		 * Descriptors opened only with O_EXEC or O_RDONLY are allowed.
 		 */
 		error = fgetvp_exec(td, args->fd,
-		    cap_rights_init(&rights, CAP_FEXECVE), &newtextvp);
+		    cap_rights_init(&rights, CAP_FEXECVE), &newtextvp, &fp);
 		if (error)
 			goto exec_fail;
+		noatime = fp->f_flag & O_NOATIME;
+		fdrop(fp, td);
 		vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY);
 		AUDIT_ARG_VNODE1(newtextvp);
 		imgp->vp = newtextvp;
@@ -880,7 +884,8 @@ interpret:
 	else
 		exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base);
 
-	vfs_mark_atime(imgp->vp, td->td_ucred);
+	if (!noatime)
+		vfs_mark_atime(imgp->vp, td->td_ucred);
 
 	SDT_PROBE1(proc, , , exec__success, args->fname);
 
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 3138dda..6c64da4 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -309,6 +309,8 @@ vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
 		return (EOPNOTSUPP);
 	if (vp->v_type != VDIR && fmode & O_DIRECTORY)
 		return (ENOTDIR);
+	if ((fmode & O_NOATIME) && ((fmode & FREAD) == 0))
+		return (EOPNOTSUPP);
 	accmode = 0;
 	if (fmode & (FWRITE | O_TRUNC)) {
 		if (vp->v_type == VDIR)
@@ -317,6 +319,8 @@ vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred,
 	}
 	if (fmode & FREAD)
 		accmode |= VREAD;
+	if ((fmode & O_NOATIME) && (fmode & FREAD))
+		accmode |= VADMIN;
 	if (fmode & FEXEC)
 		accmode |= VEXEC;
 	if ((fmode & O_APPEND) && (fmode & FWRITE))
@@ -798,6 +802,8 @@ vn_read(fp, uio, active_cred, flags, td)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
+	if (fp->f_flag & O_NOATIME)
+		ioflag |= IO_NOATIME;
 	advice = get_advice(fp, uio);
 	vn_lock(vp, LK_SHARED | LK_RETRY);
 
@@ -2398,6 +2404,7 @@ vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
 	vm_object_t object;
 	vm_prot_t maxprot;
 	boolean_t writecounted;
+	boolean_t noatime;
 	int error;
 
 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
@@ -2470,9 +2477,10 @@ vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size,
 	    foff < 0 || foff > OFF_MAX - size)
 		return (EINVAL);
 
+	noatime = fp->f_flag & O_NOATIME;
 	writecounted = FALSE;
 	error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp,
-	    &foff, &object, &writecounted);
+	    &foff, &object, &writecounted, noatime);
 	if (error != 0)
 		return (error);
 	error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object,
diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h
index d1d0062..71a8e59 100644
--- a/sys/sys/fcntl.h
+++ b/sys/sys/fcntl.h
@@ -133,6 +133,8 @@ typedef	__pid_t		pid_t;
 #define	O_VERIFY	0x00200000	/* open only after verification */
 #endif
 
+#define O_NOATIME	0x00400000	/* do not update atime */
+
 /*
  * XXX missing O_DSYNC, O_RSYNC.
  */
@@ -150,9 +152,9 @@ typedef	__pid_t		pid_t;
 #define	OFLAGS(fflags)	((fflags) & O_EXEC ? (fflags) : (fflags) - 1)
 
 /* bits to save after open */
-#define	FMASK	(FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT|FEXEC)
+#define	FMASK	(FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT|FEXEC|O_NOATIME)
 /* bits settable by fcntl(F_SETFL, ...) */
-#define	FCNTLFLAGS	(FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FRDAHEAD|O_DIRECT)
+#define	FCNTLFLAGS	(FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FRDAHEAD|O_DIRECT|O_NOATIME)
 
 #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \
     defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4)
@@ -164,7 +166,7 @@ typedef	__pid_t		pid_t;
 #define	FPOSIXSHM	O_NOFOLLOW
 #undef FCNTLFLAGS
 #define	FCNTLFLAGS	(FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|FRDAHEAD| \
-			 O_DIRECT)
+			 O_DIRECT|O_NOATIME)
 #endif
 #endif
 
diff --git a/sys/sys/file.h b/sys/sys/file.h
index d14ec32..d70eb6d 100644
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@@ -259,7 +259,7 @@ void finit(struct file *, u_int, short, void *, struct fileops *);
 int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp,
     struct vnode **vpp);
 int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp,
-    struct vnode **vpp);
+    struct vnode **vpp, struct file **fpp);
 int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
     struct filecaps *havecaps, struct vnode **vpp);
 int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp,
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index dedbec6..28e0923 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -302,6 +302,7 @@ struct vattr {
 #define	IO_INVAL	0x0040		/* invalidate after I/O */
 #define	IO_SYNC		0x0080		/* do I/O synchronously */
 #define	IO_DIRECT	0x0100		/* attempt to bypass buffer cache */
+#define	IO_NOATIME	0x0200		/* do not update atime */
 #define	IO_EXT		0x0400		/* operate on external attributes */
 #define	IO_NORMAL	0x0800		/* operate on regular data */
 #define	IO_NOMACCHECK	0x1000		/* MAC checks unnecessary */
diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
index b1de1b8..aa3aeb4 100644
--- a/sys/ufs/ffs/ffs_vnops.c
+++ b/sys/ufs/ffs/ffs_vnops.c
@@ -671,6 +671,7 @@ ffs_read(ap)
 	}
 
 	if ((error == 0 || uio->uio_resid != orig_resid) &&
+	    ((ioflag & IO_NOATIME) == 0) &&
 	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 &&
 	    (ip->i_flag & IN_ACCESS) == 0) {
 		VI_LOCK(vp);
diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h
index c37973d..7513122 100644
--- a/sys/vm/vm_extern.h
+++ b/sys/vm/vm_extern.h
@@ -94,7 +94,7 @@ int vm_mmap_to_errno(int rv);
 int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
     int *, struct cdev *, struct cdevsw *, vm_ooffset_t *, vm_object_t *);
 int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *,
-    struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
+    struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *, boolean_t);
 void vm_set_page_size(void);
 void vm_sync_icache(vm_map_t, vm_offset_t, vm_size_t);
 typedef int (*pmap_pinit_t)(struct pmap *pmap);
diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c
index d0f14f3..470a459 100644
--- a/sys/vm/vm_mmap.c
+++ b/sys/vm/vm_mmap.c
@@ -1192,7 +1192,7 @@ int
 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
     struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
-    boolean_t *writecounted)
+    boolean_t *writecounted, boolean_t noatime)
 {
 	struct vattr va;
 	vm_object_t obj;
@@ -1283,7 +1283,8 @@ vm_mmap_vnode(struct thread *td, vm_size_t objsize,
 	*objp = obj;
 	*flagsp = flags;
 
-	vfs_mark_atime(vp, cred);
+	if (!noatime)
+		vfs_mark_atime(vp, cred);
 
 done:
 	if (error != 0 && *writecounted) {
@@ -1400,7 +1401,7 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
 	}
 	case OBJT_VNODE:
 		error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
-		    handle, &foff, &object, &writecounted);
+		    handle, &foff, &object, &writecounted, FALSE);
 		break;
 	case OBJT_DEFAULT:
 		if (handle == NULL) {

--envbJBWh7q8WU6mo--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20170827235424.GA34762>