Date: Fri, 15 Jul 2011 16:20:33 +0000 (UTC)
From: Kirk McKusick <mckusick@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r224061 - in head/sys/ufs: ffs ufs
Message-ID: <201107151620.p6FGKX98086893@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: mckusick Date: Fri Jul 15 16:20:33 2011 New Revision: 224061 URL: http://svn.freebsd.org/changeset/base/224061 Log: Add an FFS specific mount option to allow a filesystem checker (typically fsck_ffs) to register that it wishes to use FFS specific sysctl's to update the filesystem. This ensures that two checkers cannot run on a given filesystem at the same time and that no other process accidentally or maliciously uses the filesystem updating sysctls inappropriately. This functionality is needed by the journaling soft-updates recovery code. Modified: head/sys/ufs/ffs/ffs_alloc.c head/sys/ufs/ffs/ffs_inode.c head/sys/ufs/ffs/ffs_vfsops.c head/sys/ufs/ffs/fs.h head/sys/ufs/ufs/ufsmount.h Modified: head/sys/ufs/ffs/ffs_alloc.c ============================================================================== --- head/sys/ufs/ffs/ffs_alloc.c Fri Jul 15 15:56:23 2011 (r224060) +++ head/sys/ufs/ffs/ffs_alloc.c Fri Jul 15 16:20:33 2011 (r224061) @@ -2381,6 +2381,18 @@ ffs_fserr(fs, inum, cp) * in the current directory is oldvalue then change it to newvalue. * unlink(nameptr, oldvalue) - Verify that the inode number associated * with nameptr in the current directory is oldvalue then unlink it. + * + * The following functions may only be used on a quiescent filesystem + * by the soft updates journal. They are not safe to be run on an active + * filesystem. + * + * setinode(inode, dip) - the specified disk inode is replaced with the + * contents pointed to by dip. + * setbufoutput(fd, flags) - output associated with the specified file + * descriptor (which must reference the character device supporting + * the filesystem) switches from using physio to running through the + * buffer cache when flags is set to 1. The descriptor reverts to + * physio for output when flags is set to zero. 
*/ static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS); @@ -2427,11 +2439,21 @@ static SYSCTL_NODE(_vfs_ffs, FFS_SET_DOT static SYSCTL_NODE(_vfs_ffs, FFS_UNLINK, unlink, CTLFLAG_WR, sysctl_ffs_fsck, "Unlink a Duplicate Name"); +static SYSCTL_NODE(_vfs_ffs, FFS_SET_INODE, setinode, CTLFLAG_WR, + sysctl_ffs_fsck, "Update an On-Disk Inode"); + +static SYSCTL_NODE(_vfs_ffs, FFS_SET_BUFOUTPUT, setbufoutput, CTLFLAG_WR, + sysctl_ffs_fsck, "Set Buffered Writing for Descriptor"); + +#define DEBUG 1 #ifdef DEBUG -static int fsckcmds = 0; +static int fsckcmds = 1; SYSCTL_INT(_debug, OID_AUTO, fsckcmds, CTLFLAG_RW, &fsckcmds, 0, ""); #endif /* DEBUG */ +static int buffered_write(struct file *, struct uio *, struct ucred *, + int, struct thread *); + static int sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) { @@ -2445,8 +2467,10 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) ufs2_daddr_t blkno; long blkcnt, blksize; struct filedesc *fdp; - struct file *fp; + struct file *fp, *vfp; int vfslocked, filetype, error; + static struct fileops *origops, bufferedops; + static int outcnt = 0; if (req->newlen > sizeof cmd) return (EBADRPC); @@ -2454,7 +2478,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) return (error); if (cmd.version != FFS_CMD_VERSION) return (ERPCMISMATCH); - if ((error = getvnode(curproc->p_fd, cmd.handle, &fp)) != 0) + if ((error = getvnode(td->td_proc->p_fd, cmd.handle, &fp)) != 0) return (error); vp = fp->f_data; if (vp->v_type != VREG && vp->v_type != VDIR) { @@ -2467,12 +2491,13 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) fdrop(fp, td); return (EINVAL); } - if (mp->mnt_flag & MNT_RDONLY) { + ump = VFSTOUFS(mp); + if ((mp->mnt_flag & MNT_RDONLY) && + ump->um_fsckpid != td->td_proc->p_pid) { vn_finished_write(mp); fdrop(fp, td); return (EROFS); } - ump = VFSTOUFS(mp); fs = ump->um_fs; filetype = IFREG; @@ -2493,7 +2518,7 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) case FFS_ADJ_REFCNT: #ifdef DEBUG if (fsckcmds) { - printf("%s: adjust inode %jd count by %jd\n", + printf("%s: adjust inode %jd link 
count by %jd\n", mp->mnt_stat.f_mntonname, (intmax_t)cmd.value, (intmax_t)cmd.size); } @@ -2504,7 +2529,8 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) ip->i_nlink += cmd.size; DIP_SET(ip, i_nlink, ip->i_nlink); ip->i_effnlink += cmd.size; - ip->i_flag |= IN_CHANGE; + ip->i_flag |= IN_CHANGE | IN_MODIFIED; + error = ffs_update(vp, 1); if (DOINGSOFTDEP(vp)) softdep_change_linkcnt(ip); vput(vp); @@ -2522,7 +2548,8 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) break; ip = VTOI(vp); DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + cmd.size); - ip->i_flag |= IN_CHANGE; + ip->i_flag |= IN_CHANGE | IN_MODIFIED; + error = ffs_update(vp, 1); vput(vp); break; @@ -2722,6 +2749,78 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) UIO_USERSPACE, (ino_t)cmd.size); break; + case FFS_SET_INODE: + if (ump->um_fsckpid != td->td_proc->p_pid) { + error = EPERM; + break; + } +#ifdef DEBUG + if (fsckcmds && outcnt++ < 100) { + printf("%s: update inode %jd\n", + mp->mnt_stat.f_mntonname, (intmax_t)cmd.value); + } +#endif /* DEBUG */ + if ((error = ffs_vget(mp, (ino_t)cmd.value, LK_EXCLUSIVE, &vp))) + break; + vfslocked = VFS_LOCK_GIANT(vp->v_mount); + AUDIT_ARG_VNODE1(vp); + ip = VTOI(vp); + if (ip->i_ump->um_fstype == UFS1) + error = copyin((void *)(intptr_t)cmd.size, ip->i_din1, + sizeof(struct ufs1_dinode)); + else + error = copyin((void *)(intptr_t)cmd.size, ip->i_din2, + sizeof(struct ufs2_dinode)); + if (error) { + vput(vp); + VFS_UNLOCK_GIANT(vfslocked); + break; + } + ip->i_flag |= IN_CHANGE | IN_MODIFIED; + error = ffs_update(vp, 1); + vput(vp); + VFS_UNLOCK_GIANT(vfslocked); + break; + + case FFS_SET_BUFOUTPUT: + if (ump->um_fsckpid != td->td_proc->p_pid) { + error = EPERM; + break; + } + if (VTOI(vp)->i_ump != ump) { + error = EINVAL; + break; + } +#ifdef DEBUG + if (fsckcmds) { + printf("%s: %s buffered output for descriptor %jd\n", + mp->mnt_stat.f_mntonname, + cmd.size == 1 ? 
"enable" : "disable", + (intmax_t)cmd.value); + } +#endif /* DEBUG */ + if ((error = getvnode(td->td_proc->p_fd, cmd.value, &vfp)) != 0) + break; + if (vfp->f_vnode->v_type != VCHR) { + fdrop(vfp, td); + error = EINVAL; + break; + } + if (origops == NULL) { + origops = vfp->f_ops; + bcopy((void *)origops, (void *)&bufferedops, + sizeof(bufferedops)); + bufferedops.fo_write = buffered_write; + } + if (cmd.size == 1) + atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops, + (uintptr_t)&bufferedops); + else + atomic_store_rel_ptr((volatile uintptr_t *)&vfp->f_ops, + (uintptr_t)origops); + fdrop(vfp, td); + break; + default: #ifdef DEBUG if (fsckcmds) { @@ -2737,3 +2836,73 @@ sysctl_ffs_fsck(SYSCTL_HANDLER_ARGS) vn_finished_write(mp); return (error); } + +/* + * Function to switch a descriptor to use the buffer cache to stage + * its I/O. This is needed so that writes to the filesystem device + * will give snapshots a chance to copy modified blocks for which it + * needs to retain copies. + */ +static int +buffered_write(fp, uio, active_cred, flags, td) + struct file *fp; + struct uio *uio; + struct ucred *active_cred; + int flags; + struct thread *td; +{ + struct vnode *devvp; + struct inode *ip; + struct buf *bp; + struct fs *fs; + int error, vfslocked; + daddr_t lbn; + static int outcnt = 0; + + /* + * The devvp is associated with the /dev filesystem. To discover + * the filesystem with which the device is associated, we depend + * on the application setting the current directory to a location + * within the filesystem being written. Yes, this is an ugly hack. 
+ */ + devvp = fp->f_vnode; + ip = VTOI(td->td_proc->p_fd->fd_cdir); + if (ip->i_devvp != devvp) + return (EINVAL); + fs = ip->i_fs; + vfslocked = VFS_LOCK_GIANT(ip->i_vnode->v_mount); + vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY); + if ((flags & FOF_OFFSET) == 0) + uio->uio_offset = fp->f_offset; +#ifdef DEBUG + if (fsckcmds && outcnt++ < 100) { + printf("%s: buffered write for block %jd\n", + fs->fs_fsmnt, (intmax_t)btodb(uio->uio_offset)); + } +#endif /* DEBUG */ + /* + * All I/O must be contained within a filesystem block, start on + * a fragment boundary, and be a multiple of fragments in length. + */ + if (uio->uio_resid > fs->fs_bsize - (uio->uio_offset % fs->fs_bsize) || + fragoff(fs, uio->uio_offset) != 0 || + fragoff(fs, uio->uio_resid) != 0) { + error = EINVAL; + goto out; + } + lbn = numfrags(fs, uio->uio_offset); + bp = getblk(devvp, lbn, uio->uio_resid, 0, 0, 0); + bp->b_flags |= B_RELBUF; + if ((error = uiomove((char *)bp->b_data, uio->uio_resid, uio)) != 0) { + brelse(bp); + goto out; + } + error = bwrite(bp); + if ((flags & FOF_OFFSET) == 0) + fp->f_offset = uio->uio_offset; + fp->f_nextoff = uio->uio_offset; +out: + VOP_UNLOCK(devvp, 0); + VFS_UNLOCK_GIANT(vfslocked); + return (error); +} Modified: head/sys/ufs/ffs/ffs_inode.c ============================================================================== --- head/sys/ufs/ffs/ffs_inode.c Fri Jul 15 15:56:23 2011 (r224060) +++ head/sys/ufs/ffs/ffs_inode.c Fri Jul 15 16:20:33 2011 (r224061) @@ -90,7 +90,7 @@ ffs_update(vp, waitfor) return (0); ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED); fs = ip->i_fs; - if (fs->fs_ronly) + if (fs->fs_ronly && ip->i_ump->um_fsckpid == 0) return (0); error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)), (int)fs->fs_bsize, NOCRED, &bp); Modified: head/sys/ufs/ffs/ffs_vfsops.c ============================================================================== --- head/sys/ufs/ffs/ffs_vfsops.c Fri Jul 15 15:56:23 2011 (r224060) +++ 
head/sys/ufs/ffs/ffs_vfsops.c Fri Jul 15 16:20:33 2011 (r224061) @@ -132,8 +132,8 @@ static struct buf_ops ffs_ops = { */ static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr", "noclusterw", "noexec", "export", "force", "from", "groupquota", - "multilabel", "nfsv4acls", "snapshot", "nosuid", "suiddir", "nosymfollow", - "sync", "union", "userquota", NULL }; + "multilabel", "nfsv4acls", "fsckpid", "snapshot", "nosuid", "suiddir", + "nosymfollow", "sync", "union", "userquota", NULL }; static int ffs_mount(struct mount *mp) @@ -142,6 +142,7 @@ ffs_mount(struct mount *mp) struct thread *td; struct ufsmount *ump = 0; struct fs *fs; + pid_t fsckpid = 0; int error, flags; u_int mntorflags; accmode_t accmode; @@ -184,6 +185,29 @@ ffs_mount(struct mount *mp) vfs_deleteopt(mp->mnt_opt, "snapshot"); } + if (vfs_getopt(mp->mnt_optnew, "fsckpid", NULL, NULL) == 0 && + vfs_scanopt(mp->mnt_optnew, "fsckpid", "%d", &fsckpid) == 1) { + /* + * Once we have set the restricted PID, do not + * persist "fsckpid" in the options list. 
+ */ + vfs_deleteopt(mp->mnt_optnew, "fsckpid"); + vfs_deleteopt(mp->mnt_opt, "fsckpid"); + if (mp->mnt_flag & MNT_UPDATE) { + if (VFSTOUFS(mp)->um_fs->fs_ronly == 0 && + vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { + printf("Checker enable: Must be read-only\n"); + return (EINVAL); + } + } else if (vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0) == 0) { + printf("Checker enable: Must be read-only\n"); + return (EINVAL); + } + /* Set to -1 if we are done */ + if (fsckpid == 0) + fsckpid = -1; + } + if (vfs_getopt(mp->mnt_optnew, "nfsv4acls", NULL, NULL) == 0) { if (mntorflags & MNT_ACLS) { printf("WARNING: \"acls\" and \"nfsv4acls\" " @@ -204,6 +228,20 @@ ffs_mount(struct mount *mp) ump = VFSTOUFS(mp); fs = ump->um_fs; devvp = ump->um_devvp; + if (fsckpid == -1 && ump->um_fsckpid > 0) { + if ((error = ffs_flushfiles(mp, WRITECLOSE, td)) != 0 || + (error = ffs_sbupdate(ump, MNT_WAIT, 0)) != 0) + return (error); + DROP_GIANT(); + g_topology_lock(); + /* + * Return to normal read-only mode. + */ + error = g_access(ump->um_cp, 0, -1, 0); + g_topology_unlock(); + PICKUP_GIANT(); + ump->um_fsckpid = 0; + } if (fs->fs_ronly == 0 && vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* @@ -295,6 +333,13 @@ ffs_mount(struct mount *mp) if (fs->fs_ronly && !vfs_flagopt(mp->mnt_optnew, "ro", NULL, 0)) { /* + * If we are running a checker, do not allow upgrade. + */ + if (ump->um_fsckpid > 0) { + printf("Active checker, cannot rw upgrade\n"); + return (EINVAL); + } + /* * If upgrade to read-write by non-root, then verify * that user has necessary permissions on the device. */ @@ -388,6 +433,39 @@ ffs_mount(struct mount *mp) mp->mnt_flag |= MNT_NFS4ACLS; MNT_IUNLOCK(mp); } + /* + * If this is a request from fsck to clean up the filesystem, + * then allow the specified pid to proceed. 
+ */ + if (fsckpid > 0) { + if (ump->um_fsckpid != 0) { + printf("Active checker already running on %s\n", + fs->fs_fsmnt); + return (EINVAL); + } + KASSERT((mp->mnt_flag & MNT_SOFTDEP) == 0, + ("soft updates enabled on read-only file system")); + DROP_GIANT(); + g_topology_lock(); + /* + * Request write access. + */ + error = g_access(ump->um_cp, 0, 1, 0); + g_topology_unlock(); + PICKUP_GIANT(); + if (error) { + printf("Checker activation failed on %s\n", + fs->fs_fsmnt); + return (error); + } + ump->um_fsckpid = fsckpid; + if (fs->fs_snapinum[0] != 0) + ffs_snapshot_mount(mp); + fs->fs_mtime = time_second; + fs->fs_fmod = 1; + fs->fs_clean = 0; + (void) ffs_sbupdate(ump, MNT_WAIT, 0); + } /* * If this is a snapshot request, take the snapshot. @@ -451,6 +529,31 @@ ffs_mount(struct mount *mp) vrele(devvp); return (error); } + if (fsckpid > 0) { + KASSERT((mp->mnt_flag & MNT_SOFTDEP) == 0, + ("soft updates enabled on read-only file system")); + ump = VFSTOUFS(mp); + fs = ump->um_fs; + DROP_GIANT(); + g_topology_lock(); + /* + * Request write access. + */ + error = g_access(ump->um_cp, 0, 1, 0); + g_topology_unlock(); + PICKUP_GIANT(); + if (error) { + printf("Checker activation failed on %s\n", + fs->fs_fsmnt); + } else { + ump->um_fsckpid = fsckpid; + if (fs->fs_snapinum[0] != 0) + ffs_snapshot_mount(mp); + fs->fs_mtime = time_second; + fs->fs_clean = 0; + (void) ffs_sbupdate(ump, MNT_WAIT, 0); + } + } } vfs_mountedfrom(mp, fspec); return (0); @@ -1161,7 +1264,7 @@ ffs_unmount(mp, mntflags) } UFS_UNLOCK(ump); softdep_unmount(mp); - if (fs->fs_ronly == 0) { + if (fs->fs_ronly == 0 || ump->um_fsckpid > 0) { fs->fs_clean = fs->fs_flags & (FS_UNCLEAN|FS_NEEDSFSCK) ? 0 : 1; error = ffs_sbupdate(ump, MNT_WAIT, 0); if (error && error != ENXIO) { @@ -1175,6 +1278,13 @@ ffs_unmount(mp, mntflags) } DROP_GIANT(); g_topology_lock(); + if (ump->um_fsckpid > 0) { + /* + * Return to normal read-only mode. 
+ */ + error = g_access(ump->um_cp, 0, -1, 0); + ump->um_fsckpid = 0; + } g_vfs_close(ump->um_cp); g_topology_unlock(); PICKUP_GIANT(); @@ -1323,7 +1433,7 @@ ffs_sync(mp, waitfor) td = curthread; fs = ump->um_fs; - if (fs->fs_fmod != 0 && fs->fs_ronly != 0) { /* XXX */ + if (fs->fs_fmod != 0 && fs->fs_ronly != 0 && ump->um_fsckpid == 0) { printf("fs = %s\n", fs->fs_fsmnt); panic("ffs_sync: rofs mod"); } @@ -1681,12 +1791,12 @@ ffs_uninit(vfsp) * Write a superblock and associated information back to disk. */ int -ffs_sbupdate(mp, waitfor, suspended) - struct ufsmount *mp; +ffs_sbupdate(ump, waitfor, suspended) + struct ufsmount *ump; int waitfor; int suspended; { - struct fs *fs = mp->um_fs; + struct fs *fs = ump->um_fs; struct buf *sbbp; struct buf *bp; int blks; @@ -1694,14 +1804,14 @@ ffs_sbupdate(mp, waitfor, suspended) int i, size, error, allerror = 0; if (fs->fs_ronly == 1 && - (mp->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != - (MNT_RDONLY | MNT_UPDATE)) + (ump->um_mountp->mnt_flag & (MNT_RDONLY | MNT_UPDATE)) != + (MNT_RDONLY | MNT_UPDATE) && ump->um_fsckpid == 0) panic("ffs_sbupdate: write read-only filesystem"); /* * We use the superblock's buf to serialize calls to ffs_sbupdate(). */ - sbbp = getblk(mp->um_devvp, btodb(fs->fs_sblockloc), (int)fs->fs_sbsize, - 0, 0, 0); + sbbp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc), + (int)fs->fs_sbsize, 0, 0, 0); /* * First write back the summary information. 
*/ @@ -1711,7 +1821,7 @@ ffs_sbupdate(mp, waitfor, suspended) size = fs->fs_bsize; if (i + fs->fs_frag > blks) size = (blks - i) * fs->fs_fsize; - bp = getblk(mp->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), + bp = getblk(ump->um_devvp, fsbtodb(fs, fs->fs_csaddr + i), size, 0, 0, 0); bcopy(space, bp->b_data, (u_int)size); space = (char *)space + size; @@ -1747,9 +1857,9 @@ ffs_sbupdate(mp, waitfor, suspended) fs->fs_fmod = 0; fs->fs_time = time_second; if (fs->fs_flags & FS_DOSOFTDEP) - softdep_setup_sbupdate(mp, (struct fs *)bp->b_data, bp); + softdep_setup_sbupdate(ump, (struct fs *)bp->b_data, bp); bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize); - ffs_oldfscompat_write((struct fs *)bp->b_data, mp); + ffs_oldfscompat_write((struct fs *)bp->b_data, ump); if (suspended) bp->b_flags |= B_VALIDSUSPWRT; if (waitfor != MNT_WAIT) Modified: head/sys/ufs/ffs/fs.h ============================================================================== --- head/sys/ufs/ffs/fs.h Fri Jul 15 15:56:23 2011 (r224060) +++ head/sys/ufs/ffs/fs.h Fri Jul 15 16:20:33 2011 (r224061) @@ -214,7 +214,9 @@ #define FFS_SET_CWD 12 /* set current directory */ #define FFS_SET_DOTDOT 13 /* set inode number for ".." */ #define FFS_UNLINK 14 /* remove a name in the filesystem */ -#define FFS_MAXID 15 /* number of valid ffs ids */ +#define FFS_SET_INODE 15 /* update an on-disk inode */ +#define FFS_SET_BUFOUTPUT 16 /* set buffered writing on descriptor */ +#define FFS_MAXID 16 /* number of valid ffs ids */ /* * Command structure passed in to the filesystem to adjust filesystem values. 
Modified: head/sys/ufs/ufs/ufsmount.h ============================================================================== --- head/sys/ufs/ufs/ufsmount.h Fri Jul 15 15:56:23 2011 (r224060) +++ head/sys/ufs/ufs/ufsmount.h Fri Jul 15 16:20:33 2011 (r224061) @@ -77,6 +77,7 @@ struct ufsmount { u_long um_bptrtodb; /* indir ptr to disk block */ u_long um_seqinc; /* inc between seq blocks */ struct mtx um_lock; /* Protects ufsmount & fs */ + pid_t um_fsckpid; /* PID permitted fsck sysctls */ long um_numindirdeps; /* outstanding indirdeps */ struct workhead softdep_workitem_pending; /* softdep work queue */ struct worklist *softdep_worklist_tail; /* Tail pointer for above */
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201107151620.p6FGKX98086893>