Date: Mon, 28 Aug 2017 01:54:24 +0200 From: Daniel Roethlisberger <daniel@roe.ch> To: freebsd-hackers@freebsd.org Subject: Re: [PATCH] O_NOATIME support for open(2) Message-ID: <20170827235424.GA34762@schoggimuss.roe.ch> In-Reply-To: <20170827141708.GV1700@kib.kiev.ua> References: <20170826161827.GA21456@schoggimuss.roe.ch> <20170826175606.GQ1700@kib.kiev.ua> <20170827131806.GB21456@schoggimuss.roe.ch> <20170827141708.GV1700@kib.kiev.ua>
next in thread | previous in thread | raw e-mail | index | archive | help
--envbJBWh7q8WU6mo Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Konstantin Belousov <kostikbel@gmail.com> 2017-08-27: > > > > The attached patch against 11.1 implements O_NOATIME support for > > > > open(2); it prevents read(2) and mmap(2) from clobbering atime if > > > > the file descriptor was opened with O_NOATIME. O_NOATIME is only > > > > permitted for root and the owner of the file. Currently it is > > > > only implemented for ufs/ffs. It seems to work for me but has > > > > not been extensively tested. > > > What would happen when additional page-in occurs on the mmaped area ? > > > > With mmap, the vnode is marked for atime update at the time of > > calling mmap (unless O_NOATIME is set on the fd). I do not see > > how the patch would impact page-ins in any way. Can you > > elaborate? > > I mean, do we have some code paths which would cause page-ins to set > atime ? If we currently do not have that, fine. My brief reading of the > code suggests that we do not, at least for UFS. Thanks for clarifying. I am not aware of any atime-updating page-in code paths. > Somewhat related, if an image file is opened O_EXEC | O_NOATIME, does > calling fexecve(2) on the fd prevents atime update with your patch ? > It seems to me that the case is not handled. Correct, thanks. > Note that in kernel code, we usually prefer O_XXX spelling for the open > flags over the FXXX. I removed FNOATIME entirely based on your feedback. Attached a revised patch that incorporates most of the feedback, plus supports setting/unsetting O_NOATIME through fcntl and handles many of the local file systems. It also enables O_NOATIME in the Linux compat shim. Docs are still missing, and testing on other filesystems than UFS very limited or not at all. Additional feedback much appreciated, especially on the code in kern_fcntl and vn_open_vnode. Daniel -- Daniel Roethlisberger http://daniel.roe.ch/ --envbJBWh7q8WU6mo Content-Type: text/x-diff; charset=us-ascii Content-Disposition: attachment; filename="onoatime-v3.diff" diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index 17819bc..ea736f1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -820,7 +820,13 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) out: zfs_range_unlock(rl); +#ifdef __FreeBSD__ + if (ioflag & IO_NOATIME == 0) { + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + } +#else ZFS_ACCESSTIME_STAMP(zfsvfs, zp); +#endif /* __FreeBSD__ */ ZFS_EXIT(zfsvfs); return (error); } diff --git a/sys/compat/linux/linux_file.c b/sys/compat/linux/linux_file.c index 09133eb..2bf40c4 100644 --- a/sys/compat/linux/linux_file.c +++ b/sys/compat/linux/linux_file.c @@ -130,7 +130,8 @@ linux_common_open(struct thread *td, int dirfd, char *path, int l_flags, int mod bsd_flags |= O_NOFOLLOW; if (l_flags & LINUX_O_DIRECTORY) bsd_flags |= O_DIRECTORY; - /* XXX LINUX_O_NOATIME: unable to be easily implemented. */ + if (l_flags & LINUX_O_NOATIME) + bsd_flags |= O_NOATIME; error = kern_openat(td, dirfd, path, UIO_SYSSPACE, bsd_flags, mode); if (error != 0) @@ -1327,6 +1328,8 @@ fcntl_common(struct thread *td, struct linux_fcntl_args *args) if (result & O_DIRECT) td->td_retval[0] |= LINUX_O_DIRECT; #endif + if (result & O_NOATIME) + td->td_retval[0] |= LINUX_O_NOATIME; return (error); case LINUX_F_SETFL: @@ -1347,6 +1350,8 @@ fcntl_common(struct thread *td, struct linux_fcntl_args *args) if (args->arg & LINUX_O_DIRECT) arg |= O_DIRECT; #endif + if (args->arg & LINUX_O_NOATIME) + arg |= O_NOATIME; return (kern_fcntl(td, args->fd, F_SETFL, arg)); case LINUX_F_GETLK: diff --git a/sys/fs/ext2fs/ext2_vnops.c b/sys/fs/ext2fs/ext2_vnops.c index da3b267..a9d4702 100644 --- a/sys/fs/ext2fs/ext2_vnops.c +++ b/sys/fs/ext2fs/ext2_vnops.c @@ -1765,6 +1765,7 @@ ext2_ind_read(struct vop_read_args *ap) } if ((error == 0 || uio->uio_resid != orig_resid) && + ((ioflag & IO_NOATIME) == 0) && (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) ip->i_flag |= IN_ACCESS; return (error); diff --git a/sys/fs/msdosfs/msdosfs_vnops.c b/sys/fs/msdosfs/msdosfs_vnops.c index cfabfaa..03b0ab2 100644 --- a/sys/fs/msdosfs/msdosfs_vnops.c +++ b/sys/fs/msdosfs/msdosfs_vnops.c @@ -600,6 +600,7 @@ msdosfs_read(struct vop_read_args *ap) brelse(bp); } while (error == 0 && uio->uio_resid > 0 && n != 0); if (!isadir && (error == 0 || uio->uio_resid != orig_resid) && + ((ap->a_ioflag & IO_NOATIME) == 0) && (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0) dep->de_flag |= DE_ACCESS; return (error); diff --git a/sys/fs/tmpfs/tmpfs_vnops.c b/sys/fs/tmpfs/tmpfs_vnops.c index 21baeb7..543d7fc 100644 --- a/sys/fs/tmpfs/tmpfs_vnops.c +++ b/sys/fs/tmpfs/tmpfs_vnops.c @@ -476,7 +476,8 @@ tmpfs_read(struct vop_read_args *v) if (uio->uio_offset < 0) return (EINVAL); node = VP_TO_TMPFS_NODE(vp); - tmpfs_set_status(node, TMPFS_NODE_ACCESSED); + if ((v->a_ioflag & IO_NOATIME) == 0) + tmpfs_set_status(node, TMPFS_NODE_ACCESSED); return (uiomove_object(node->tn_reg.tn_aobj, node->tn_size, uio)); } diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index bea81a5..0f97601 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -553,6 +553,27 @@ kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg) cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp); if (error != 0) break; + if ((arg & O_NOATIME) && ((fp->f_flag & O_NOATIME) == 0)) { + if (fp->f_type != DTYPE_VNODE) { + error = ENOTTY; + fdrop(fp, td); + break; + } + if ((fp->f_flag & FREAD) == 0) { + error = EOPNOTSUPP; + fdrop(fp, td); + break; + } + vp = fp->f_vnode; + vrefact(vp); + vn_lock(vp, LK_SHARED | LK_RETRY); + error = VOP_ACCESS(vp, VADMIN, td->td_ucred, td); + vput(vp); + if (error != 0) { + fdrop(fp, td); + break; + } + } do { tmp = flg = fp->f_flag; tmp &= ~FCNTLFLAGS; @@ -2820,10 +2841,23 @@ fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp } int -fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp) +fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp, + struct file **fpp) { + int error; - return (_fgetvp(td, fd, FEXEC, rightsp, vpp)); + *vpp = NULL; + error = _fget(td, fd, fpp, FEXEC, rightsp, NULL); + if (error != 0) + return (error); + if ((*fpp)->f_vnode == NULL) { + error = EINVAL; + } else { + *vpp = (*fpp)->f_vnode; + vrefact(*vpp); + } + + return (error); } #ifdef notyet diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index 1a3cc42..0fcfeee 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -381,6 +381,8 @@ do_execve(td, args, mac_p) #ifdef HWPMC_HOOKS struct pmckern_procexec pe; #endif + struct file *fp; + boolean_t noatime = FALSE; static const char fexecv_proc_title[] = "(fexecv)"; imgp = &image_params; @@ -453,9 +455,11 @@ interpret: * Descriptors opened only with O_EXEC or O_RDONLY are allowed. */ error = fgetvp_exec(td, args->fd, - cap_rights_init(&rights, CAP_FEXECVE), &newtextvp); + cap_rights_init(&rights, CAP_FEXECVE), &newtextvp, &fp); if (error) goto exec_fail; + noatime = fp->f_flag & O_NOATIME; + fdrop(fp, td); vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY); AUDIT_ARG_VNODE1(newtextvp); imgp->vp = newtextvp; @@ -880,7 +884,8 @@ interpret: else exec_setregs(td, imgp, (u_long)(uintptr_t)stack_base); - vfs_mark_atime(imgp->vp, td->td_ucred); + if (!noatime) + vfs_mark_atime(imgp->vp, td->td_ucred); SDT_PROBE1(proc, , , exec__success, args->fname); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 3138dda..6c64da4 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -309,6 +309,8 @@ vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, return (EOPNOTSUPP); if (vp->v_type != VDIR && fmode & O_DIRECTORY) return (ENOTDIR); + if ((fmode & O_NOATIME) && ((fmode & FREAD) == 0)) + return (EOPNOTSUPP); accmode = 0; if (fmode & (FWRITE | O_TRUNC)) { if (vp->v_type == VDIR) @@ -317,6 +319,8 @@ vn_open_vnode(struct vnode *vp, int fmode, struct ucred *cred, } if (fmode & FREAD) accmode |= VREAD; + if ((fmode & O_NOATIME) && (fmode & FREAD)) + accmode |= VADMIN; if (fmode & FEXEC) accmode |= VEXEC; if ((fmode & O_APPEND) && (fmode & FWRITE)) @@ -798,6 +802,8 @@ vn_read(fp, uio, active_cred, flags, td) ioflag |= IO_NDELAY; if (fp->f_flag & O_DIRECT) ioflag |= IO_DIRECT; + if (fp->f_flag & O_NOATIME) + ioflag |= IO_NOATIME; advice = get_advice(fp, uio); vn_lock(vp, LK_SHARED | LK_RETRY); @@ -2398,6 +2404,7 @@ vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_object_t object; vm_prot_t maxprot; boolean_t writecounted; + boolean_t noatime; int error; #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ @@ -2470,9 +2477,10 @@ vn_mmap(struct file *fp, vm_map_t map, vm_offset_t *addr, vm_size_t size, foff < 0 || foff > OFF_MAX - size) return (EINVAL); + noatime = fp->f_flag & O_NOATIME; writecounted = FALSE; error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, vp, - &foff, &object, &writecounted); + &foff, &object, &writecounted, noatime); if (error != 0) return (error); error = vm_mmap_object(map, addr, size, prot, maxprot, flags, object, diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h index d1d0062..71a8e59 100644 --- a/sys/sys/fcntl.h +++ b/sys/sys/fcntl.h @@ -133,6 +133,8 @@ typedef __pid_t pid_t; #define O_VERIFY 0x00200000 /* open only after verification */ #endif +#define O_NOATIME 0x00400000 /* do not update atime */ + /* * XXX missing O_DSYNC, O_RSYNC. */ @@ -150,9 +152,9 @@ typedef __pid_t pid_t; #define OFLAGS(fflags) ((fflags) & O_EXEC ? (fflags) : (fflags) - 1) /* bits to save after open */ -#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT|FEXEC) +#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT|FEXEC|O_NOATIME) /* bits settable by fcntl(F_SETFL, ...) */ -#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FRDAHEAD|O_DIRECT) +#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FRDAHEAD|O_DIRECT|O_NOATIME) #if defined(COMPAT_FREEBSD7) || defined(COMPAT_FREEBSD6) || \ defined(COMPAT_FREEBSD5) || defined(COMPAT_FREEBSD4) @@ -164,7 +166,7 @@ typedef __pid_t pid_t; #define FPOSIXSHM O_NOFOLLOW #undef FCNTLFLAGS #define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|FRDAHEAD| \ - O_DIRECT) + O_DIRECT|O_NOATIME) #endif #endif diff --git a/sys/sys/file.h b/sys/sys/file.h index d14ec32..d70eb6d 100644 --- a/sys/sys/file.h +++ b/sys/sys/file.h @@ -259,7 +259,7 @@ void finit(struct file *, u_int, short, void *, struct fileops *); int fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp); int fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, - struct vnode **vpp); + struct vnode **vpp, struct file **fpp); int fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp, struct filecaps *havecaps, struct vnode **vpp); int fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index dedbec6..28e0923 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -302,6 +302,7 @@ struct vattr { #define IO_INVAL 0x0040 /* invalidate after I/O */ #define IO_SYNC 0x0080 /* do I/O synchronously */ #define IO_DIRECT 0x0100 /* attempt to bypass buffer cache */ +#define IO_NOATIME 0x0200 /* do not update atime */ #define IO_EXT 0x0400 /* operate on external attributes */ #define IO_NORMAL 0x0800 /* operate on regular data */ #define IO_NOMACCHECK 0x1000 /* MAC checks unnecessary */ diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c index b1de1b8..aa3aeb4 100644 --- a/sys/ufs/ffs/ffs_vnops.c +++ b/sys/ufs/ffs/ffs_vnops.c @@ -671,6 +671,7 @@ ffs_read(ap) } if ((error == 0 || uio->uio_resid != orig_resid) && + ((ioflag & IO_NOATIME) == 0) && (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 && (ip->i_flag & IN_ACCESS) == 0) { VI_LOCK(vp); diff --git a/sys/vm/vm_extern.h b/sys/vm/vm_extern.h index c37973d..7513122 100644 --- a/sys/vm/vm_extern.h +++ b/sys/vm/vm_extern.h @@ -94,7 +94,7 @@ int vm_mmap_to_errno(int rv); int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *, struct cdev *, struct cdevsw *, vm_ooffset_t *, vm_object_t *); int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *, int *, - struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *); + struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *, boolean_t); void vm_set_page_size(void); void vm_sync_icache(vm_map_t, vm_offset_t, vm_size_t); typedef int (*pmap_pinit_t)(struct pmap *pmap); diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c index d0f14f3..470a459 100644 --- a/sys/vm/vm_mmap.c +++ b/sys/vm/vm_mmap.c @@ -1192,7 +1192,7 @@ int vm_mmap_vnode(struct thread *td, vm_size_t objsize, vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp, struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp, - boolean_t *writecounted) + boolean_t *writecounted, boolean_t noatime) { struct vattr va; vm_object_t obj; @@ -1283,7 +1283,8 @@ vm_mmap_vnode(struct thread *td, vm_size_t objsize, *objp = obj; *flagsp = flags; - vfs_mark_atime(vp, cred); + if (!noatime) + vfs_mark_atime(vp, cred); done: if (error != 0 && *writecounted) { @@ -1400,7 +1401,7 @@ vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot, } case OBJT_VNODE: error = vm_mmap_vnode(td, size, prot, &maxprot, &flags, - handle, &foff, &object, &writecounted); + handle, &foff, &object, &writecounted, FALSE); break; case OBJT_DEFAULT: if (handle == NULL) { --envbJBWh7q8WU6mo--
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20170827235424.GA34762>