From owner-freebsd-hackers@FreeBSD.ORG Wed Apr 1 18:38:46 2009 Return-Path: Delivered-To: freebsd-hackers@FreeBSD.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id B021E1065743 for ; Wed, 1 Apr 2009 18:38:46 +0000 (UTC) (envelope-from trasz@FreeBSD.org) Received: from pin.if.uz.zgora.pl (pin.if.uz.zgora.pl [212.109.128.251]) by mx1.freebsd.org (Postfix) with ESMTP id F17028FC1F for ; Wed, 1 Apr 2009 18:38:45 +0000 (UTC) (envelope-from trasz@FreeBSD.org) Received: by pin.if.uz.zgora.pl (Postfix, from userid 1001) id 8CCFB39BA2; Wed, 1 Apr 2009 20:25:02 +0200 (CEST) Date: Wed, 1 Apr 2009 20:25:02 +0200 From: Edward Tomasz Napierala To: freebsd-hackers@FreeBSD.org Message-ID: <20090401182502.GA13651@pin.if.uz.zgora.pl> MIME-Version: 1.0 Content-Type: text/plain; charset=iso-8859-2 Content-Disposition: inline User-Agent: Mutt/1.5.18 (2008-05-17) Cc: Subject: Filesystem orphaning. X-BeenThere: freebsd-hackers@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: Technical Discussions relating to FreeBSD List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 01 Apr 2009 18:38:47 -0000 Attached is a patch that adds filesystem orphaning. What it means, from the user's point of view, is that when a disk device containing a mounted filesystem gets removed from the system, all filesystem operations return immediately with an error, the contents of the filesystem become invisible and inaccessible, and the filesystem gets marked as 'orphaned' in the mount(8) output. The only thing you can do after that is to unmount the filesystem. There are two problems. First, there is a race condition between registering the callback from the mount routine and the device disappearing, which causes the callback to be called. 
I'm not sure how serious it is; the vfs_orphan() routine contains code to prevent it from messing things up when called against a filesystem that hasn't been fully mounted yet. The second problem is that vflush(9) with FORCECLOSE isn't quite safe. Right now the only situation in which it's called is "umount -f"; this patch would add a second case. I'm a little short of time, so I won't be able to work on it anytime soon. If you like the idea - please do whatever is needed to get it to a committable state. This patch was developed during a project sponsored by the FreeBSD Foundation. Index: sbin/mount/mount.c =================================================================== --- sbin/mount/mount.c (revision 190561) +++ sbin/mount/mount.c (working copy) @@ -112,6 +112,7 @@ static struct opt { { MNT_MULTILABEL, "multilabel" }, { MNT_ACLS, "acls" }, { MNT_GJOURNAL, "gjournal" }, + { MNT_ORPHANED, "orphaned" }, { 0, NULL } }; Index: sys/ufs/ffs/ffs_vfsops.c =================================================================== --- sys/ufs/ffs/ffs_vfsops.c (revision 190561) +++ sys/ufs/ffs/ffs_vfsops.c (working copy) @@ -601,6 +601,15 @@ loop: */ static int sblock_try[] = SBLOCKSEARCH; +static void +ffs_orphan_callback(struct g_consumer *cp, void *user) +{ + struct mount *mp; + + mp = (struct mount *)user; + vfs_orphan(mp); +} + /* * Common code for mount and mountroot */ @@ -629,9 +638,13 @@ ffs_mountfs(devvp, mp, td) dev = devvp->v_rdev; dev_ref(dev); + vfs_ref(mp); DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1); + if (error == 0) + g_vfs_register_callback(cp, ffs_orphan_callback, + mp, G_CB_ORPHAN); /* * If we are a root mount, drop the E flag so fsck can do its magic. 
@@ -923,6 +936,7 @@ out: free(ump, M_UFSMNT); mp->mnt_data = NULL; } + vfs_rel(mp); dev_rel(dev); return (error); } @@ -1110,6 +1124,7 @@ ffs_unmount(mp, mntflags, td) g_topology_unlock(); PICKUP_GIANT(); vrele(ump->um_devvp); + vfs_rel(mp); dev_rel(ump->um_dev); mtx_destroy(UFS_MTX(ump)); if (mp->mnt_gjprovider != NULL) { Index: sys/kern/vfs_syscalls.c =================================================================== --- sys/kern/vfs_syscalls.c (revision 190561) +++ sys/kern/vfs_syscalls.c (working copy) @@ -326,6 +326,8 @@ kern_statfs(struct thread *td, char *path, enum ui sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (mp->mnt_kern_flag & MNTK_ORPHANED) + sp->f_flags |= MNT_ORPHANED; error = VFS_STATFS(mp, sp, td); if (error) goto out; @@ -415,6 +417,8 @@ kern_fstatfs(struct thread *td, int fd, struct sta sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (mp->mnt_kern_flag & MNTK_ORPHANED) + sp->f_flags |= MNT_ORPHANED; error = VFS_STATFS(mp, sp, td); if (error) goto out; @@ -515,6 +519,8 @@ kern_getfsstat(struct thread *td, struct statfs ** sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (mp->mnt_kern_flag & MNTK_ORPHANED) + sp->f_flags |= MNT_ORPHANED; /* * If MNT_NOWAIT or MNT_LAZY is specified, do not * refresh the fsstat cache. 
MNT_NOWAIT or MNT_LAZY @@ -4662,6 +4668,8 @@ kern_fhstatfs(struct thread *td, fhandle_t fh, str sp->f_version = STATFS_VERSION; sp->f_namemax = NAME_MAX; sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (mp->mnt_kern_flag & MNTK_ORPHANED) + sp->f_flags |= MNT_ORPHANED; error = VFS_STATFS(mp, sp, td); if (error == 0) *buf = *sp; Index: sys/kern/vfs_subr.c =================================================================== --- sys/kern/vfs_subr.c (revision 190561) +++ sys/kern/vfs_subr.c (working copy) @@ -1084,7 +1084,7 @@ insmntque1(struct vnode *vp, struct mount *mp, #endif MNT_ILOCK(mp); if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 && - ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 || + ((mp->mnt_kern_flag & (MNTK_UNMOUNTF | MNTK_ORPHANED)) != 0 || mp->mnt_nvnodelistsize == 0)) { locked = VOP_ISLOCKED(vp); if (!locked || (locked == LK_EXCLUSIVE && @@ -1092,6 +1092,8 @@ insmntque1(struct vnode *vp, struct mount *mp, MNT_IUNLOCK(mp); if (dtr != NULL) dtr(vp, dtr_arg); + if ((mp->mnt_kern_flag & MNTK_ORPHANED) != 0) + return (ENXIO); return (EBUSY); } } @@ -2875,6 +2877,7 @@ DB_SHOW_COMMAND(mount, db_show_mount) MNT_KERN_FLAG(MNTK_MPSAFE); MNT_KERN_FLAG(MNTK_NOKNOTE); MNT_KERN_FLAG(MNTK_LOOKUP_SHARED); + MNT_KERN_FLAG(MNTK_ORPHANED); #undef MNT_KERN_FLAG if (flags != 0) { if (buf[0] != '\0') @@ -4249,6 +4252,45 @@ vfs_read_dirent(struct vop_readdir_args *ap, struc } /* + * Mark the filesystem as orphaned. Usually called when the device + * that contained the filesystem goes away. + */ +void +vfs_orphan(struct mount *mp) +{ + int error; + struct mount *tmp; + + error = vfs_busy(mp, MBF_NOWAIT); + /* If the filesystem is being unmounted, do nothing. */ + if (error) + return; + + /* Prevent all future vnode operations from succeeding. */ + MNT_ILOCK(mp); + mp->mnt_kern_flag |= (MNTK_ORPHANED | MNTK_NOINSMNTQ); + MNT_IUNLOCK(mp); + + /* + * Don't try to call vflush on a mount structure that is not + * fully initialized yet. 
Assume that the mount is initialized + * if it can be found on the mountlist. + */ + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(tmp, &mountlist, mnt_list) { + if (tmp == mp) + break; + } + mtx_unlock(&mountlist_mtx); + if (tmp == NULL) { + vfs_unbusy(mp); + return; + } + vflush(mp, 0, FORCECLOSE, curthread); + vfs_unbusy(mp); +} + +/* * Mark for update the access time of the file if the filesystem * supports VOP_MARKATIME. This functionality is used by execve and * mmap, so we want to avoid the I/O implied by directly setting Index: sys/fs/msdosfs/msdosfs_vfsops.c =================================================================== --- sys/fs/msdosfs/msdosfs_vfsops.c (revision 190561) +++ sys/fs/msdosfs/msdosfs_vfsops.c (working copy) @@ -403,6 +403,15 @@ msdosfs_mount(struct mount *mp, struct thread *td) return (0); } +static void +msdosfs_orphan_callback(struct g_consumer *cp, void *user) +{ + struct mount *mp; + + mp = (struct mount *)user; + vfs_orphan(mp); +} + static int mountmsdosfs(struct vnode *devvp, struct mount *mp) { @@ -425,9 +434,13 @@ mountmsdosfs(struct vnode *devvp, struct mount *mp dev = devvp->v_rdev; dev_ref(dev); + vfs_ref(mp); DROP_GIANT(); g_topology_lock(); error = g_vfs_open(devvp, &cp, "msdosfs", ronly ? 
0 : 1); + if (error == 0) + g_vfs_register_callback(cp, msdosfs_orphan_callback, + mp, G_CB_ORPHAN); g_topology_unlock(); PICKUP_GIANT(); VOP_UNLOCK(devvp, 0); @@ -766,6 +779,7 @@ error_exit: free(pmp, M_MSDOSFSMNT); mp->mnt_data = NULL; } + vfs_rel(mp); dev_rel(dev); return (error); } @@ -831,6 +845,7 @@ msdosfs_unmount(struct mount *mp, int mntflags, st g_topology_unlock(); PICKUP_GIANT(); vrele(pmp->pm_devvp); + vfs_rel(mp); dev_rel(pmp->pm_dev); free(pmp->pm_inusemap, M_MSDOSFSFAT); if (pmp->pm_flags & MSDOSFS_LARGEFS) Index: sys/geom/geom_vfs.c =================================================================== --- sys/geom/geom_vfs.c (revision 190561) +++ sys/geom/geom_vfs.c (working copy) @@ -34,6 +34,8 @@ __FBSDID("$FreeBSD$"); #include #include #include /* XXX Temporary for VFS_LOCK_GIANT */ +#include +#include #include #include @@ -130,17 +132,78 @@ g_vfs_strategy(struct bufobj *bo, struct buf *bp) g_io_request(bip, cp); } +struct g_vfs_cb { + struct g_consumer *cb_consumer; + int cb_event; + void (*cb_callback)(struct g_consumer *, void *); + void *cb_userptr; + struct task cb_task; +}; + +/* + * When registering the callback from the mount routine, the topology lock + * is being taken while holding devvp vnode lock. The callback routine + * would probably try to grab devvp vnode lock, and executing it from + * g_event context, while holding topology lock, would cause LOR. To make + * sure this doesn't happen, we call the callback from taskqueue. 
+ */ static void +g_vfs_cb_func(void *context, int pending) +{ + struct g_vfs_cb *cb; + + cb = context; + + KASSERT(cb->cb_event == G_CB_ORPHAN, + ("found callback for unknown event")); + + (cb->cb_callback)(cb->cb_consumer, cb->cb_userptr); +} + +void +g_vfs_register_callback(struct g_consumer *cp, + void (callback)(struct g_consumer *, void *), void *userptr, int event) +{ + struct g_vfs_cb *cb; + + g_topology_assert(); + + KASSERT(event >= 0 && event <= G_CB_LAST, + ("invalid callback event flag")); + cb = cp->private; + KASSERT(cb[event].cb_callback == NULL, + ("callback already registered")); + + cb[event].cb_callback = callback; + cb[event].cb_userptr = userptr; + cb[event].cb_consumer = cp; + cb[event].cb_event = event; + TASK_INIT(&(cb[event].cb_task), 0, g_vfs_cb_func, &(cb[event])); +} + +static void g_vfs_orphan(struct g_consumer *cp) { struct g_geom *gp; struct bufobj *bo; + struct g_vfs_cb *cb; + int error; g_topology_assert(); gp = cp->geom; bo = gp->softc; + cb = cp->private; + g_trace(G_T_TOPOLOGY, "g_vfs_orphan(%p(%s))", cp, gp->name); + + if (cb != NULL && cb[G_CB_ORPHAN].cb_callback != NULL) { + error = taskqueue_enqueue(taskqueue_thread, + &(cb[G_CB_ORPHAN].cb_task)); + KASSERT(error == 0, ("taskqueue_enqueue(9) failed.")); + taskqueue_drain(taskqueue_thread, &(cb[G_CB_ORPHAN].cb_task)); + } + if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) g_access(cp, -cp->acr, -cp->acw, -cp->ace); g_detach(cp); @@ -169,6 +232,8 @@ g_vfs_open(struct vnode *vp, struct g_consumer **c gp = g_new_geomf(&g_vfs_class, "%s.%s", fsname, pp->name); cp = g_new_consumer(gp); g_attach(cp, pp); + cp->private = g_malloc(sizeof(struct g_vfs_cb[G_CB_LAST + 1]), + M_WAITOK | M_ZERO); error = g_access(cp, 1, wr, 1); if (error) { g_wither_geom(gp, ENXIO); @@ -195,6 +260,8 @@ g_vfs_close(struct g_consumer *cp) g_topology_assert(); + g_free(cp->private); + cp->private = NULL; gp = cp->geom; bo = gp->softc; bufobj_invalbuf(bo, V_SAVE, 0, 0); Index: sys/geom/geom_vfs.h 
=================================================================== --- sys/geom/geom_vfs.h (revision 190561) +++ sys/geom/geom_vfs.h (working copy) @@ -35,8 +35,13 @@ struct buf; extern struct buf_ops *g_vfs_bufops; +#define G_CB_ORPHAN 1 +#define G_CB_LAST G_CB_ORPHAN + void g_vfs_strategy(struct bufobj *bo, struct buf *bp); int g_vfs_open(struct vnode *vp, struct g_consumer **cpp, const char *fsname, int wr); void g_vfs_close(struct g_consumer *cp); +void g_vfs_register_callback(struct g_consumer *cp, + void (callback)(struct g_consumer *, void *), void *user, int event); #endif /* _GEOM_GEOM_VFS_H_ */ Index: sys/sys/mount.h =================================================================== --- sys/sys/mount.h (revision 190561) +++ sys/sys/mount.h (working copy) @@ -250,14 +250,17 @@ void __mnt_vnode_markerfree(struct vnode #define MNT_EXPUBLIC 0x20000000 /* public export (WebNFS) */ /* - * Flags set by internal operations, - * but visible to the user. - * XXX some of these are not quite right.. (I've never seen the root flag set) + * Flags set by internal operations, but visible to the user. + * Note that MNT_ORPHANED flag is never actually set on mnt_flag field + * in struct mount; it's only set on f_flags in struct statfs when + * MNTK_ORPHANED is set. We cannot use MNT_ORPHANED instead of MNTK_ORPHANED + * due to missing locking of mnt_flag. 
*/ #define MNT_LOCAL 0x00001000 /* filesystem is stored locally */ #define MNT_QUOTA 0x00002000 /* quotas are enabled on filesystem */ #define MNT_ROOTFS 0x00004000 /* identifies the root filesystem */ #define MNT_USER 0x00008000 /* mounted by a user */ +#define MNT_ORPHANED 0x00020000 /* MNTK_ORPHANED is set */ #define MNT_IGNORE 0x00800000 /* do not show entry in df */ /* @@ -273,7 +276,8 @@ void __mnt_vnode_markerfree(struct vnode MNT_ROOTFS | MNT_NOATIME | MNT_NOCLUSTERR| \ MNT_NOCLUSTERW | MNT_SUIDDIR | MNT_SOFTDEP | \ MNT_IGNORE | MNT_EXPUBLIC | MNT_NOSYMFOLLOW | \ - MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS) + MNT_GJOURNAL | MNT_MULTILABEL | MNT_ACLS | \ + MNT_ORPHANED) /* Mask of flags that can be updated. */ #define MNT_UPDATEMASK (MNT_NOSUID | MNT_NOEXEC | \ @@ -289,6 +293,8 @@ void __mnt_vnode_markerfree(struct vnode * XXX: These are not STATES and really should be somewhere else. * XXX: MNT_BYFSID collides with MNT_ACLS, but because MNT_ACLS is only used for * mount(2) and MNT_BYFSID is only used for unmount(2) it's harmless. + * XXX: MNT_DELEXPORT collides with MNT_ORPHANED, but MNT_DELEXPORT is never + * used in mnt_flag, only for ex_flags. 
*/ #define MNT_UPDATE 0x00010000 /* not a real mount, just an update */ #define MNT_DELEXPORT 0x00020000 /* delete export host lists */ @@ -325,6 +331,7 @@ void __mnt_vnode_markerfree(struct vnode #define MNTK_DRAINING 0x00000010 /* lock draining is happening */ #define MNTK_REFEXPIRE 0x00000020 /* refcount expiring is happening */ #define MNTK_EXTENDED_SHARED 0x00000040 /* Allow shared locking for more ops */ +#define MNTK_ORPHANED 0x00000080 /* device is gone */ #define MNTK_UNMOUNT 0x01000000 /* unmount in progress */ #define MNTK_MWAIT 0x02000000 /* waiting for unmount to finish */ #define MNTK_SUSPEND 0x08000000 /* request write suspension */ @@ -747,6 +754,7 @@ struct mount *vfs_mount_alloc(struct vnode *, stru int vfs_suser(struct mount *, struct thread *); void vfs_unbusy(struct mount *); void vfs_unmountall(void); +void vfs_orphan(struct mount *); extern TAILQ_HEAD(mntlist, mount) mountlist; /* mounted filesystem list */ extern struct mtx mountlist_mtx; extern struct nfs_public nfs_pub; -- If you cut off my head, what would I say? Me and my head, or me and my body?