From owner-freebsd-hackers@FreeBSD.ORG  Wed Apr  1 18:38:46 2009
Return-Path: <owner-freebsd-hackers@FreeBSD.ORG>
Delivered-To: freebsd-hackers@FreeBSD.org
Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34])
	by hub.freebsd.org (Postfix) with ESMTP id B021E1065743
	for <freebsd-hackers@FreeBSD.org>; Wed,  1 Apr 2009 18:38:46 +0000 (UTC)
	(envelope-from trasz@FreeBSD.org)
Received: from pin.if.uz.zgora.pl (pin.if.uz.zgora.pl [212.109.128.251])
	by mx1.freebsd.org (Postfix) with ESMTP id F17028FC1F
	for <freebsd-hackers@FreeBSD.org>; Wed,  1 Apr 2009 18:38:45 +0000 (UTC)
	(envelope-from trasz@FreeBSD.org)
Received: by pin.if.uz.zgora.pl (Postfix, from userid 1001)
	id 8CCFB39BA2; Wed,  1 Apr 2009 20:25:02 +0200 (CEST)
Date: Wed, 1 Apr 2009 20:25:02 +0200
From: Edward Tomasz Napierala <trasz@FreeBSD.org>
To: freebsd-hackers@FreeBSD.org
Message-ID: <20090401182502.GA13651@pin.if.uz.zgora.pl>
MIME-Version: 1.0
Content-Type: text/plain; charset=iso-8859-2
Content-Disposition: inline
User-Agent: Mutt/1.5.18 (2008-05-17)
Cc: 
Subject: Filesystem orphaning.
X-BeenThere: freebsd-hackers@freebsd.org
X-Mailman-Version: 2.1.5
Precedence: list
List-Id: Technical Discussions relating to FreeBSD
	<freebsd-hackers.freebsd.org>
List-Unsubscribe: <http://lists.freebsd.org/mailman/listinfo/freebsd-hackers>, 
	<mailto:freebsd-hackers-request@freebsd.org?subject=unsubscribe>
List-Archive: <http://lists.freebsd.org/pipermail/freebsd-hackers>
List-Post: <mailto:freebsd-hackers@freebsd.org>
List-Help: <mailto:freebsd-hackers-request@freebsd.org?subject=help>
List-Subscribe: <http://lists.freebsd.org/mailman/listinfo/freebsd-hackers>,
	<mailto:freebsd-hackers-request@freebsd.org?subject=subscribe>
X-List-Received-Date: Wed, 01 Apr 2009 18:38:47 -0000

Attached is a patch that adds filesystem orphaning.  From the user's point
of view, this means that when a disk device containing a mounted filesystem
is removed from the system, all filesystem operations return immediately
with an error, the contents of the filesystem become invisible and
inaccessible, and the filesystem is marked as 'orphaned' in the mount(8)
output.  The only thing you can do after that is unmount the filesystem.

There are two problems.  First, there is a race condition between registering
the callback from the mount routine and the device disappearing, which causes
the callback to be called.  I'm not sure how serious it is; the vfs_orphan()
routine contains code to prevent it from messing things up when called against
a filesystem that hasn't been fully mounted yet.  The second problem is that
vflush(9) with FORCECLOSE isn't quite safe.  Right now the only situation in
which it's called is "umount -f"; this patch would add a second case.

I'm a little short of time, so I won't be able to work on it anytime soon.
If you like the idea, please do whatever is needed to get it into a
committable state.

This patch was developed during a project sponsored by the FreeBSD Foundation.

Index: sbin/mount/mount.c
===================================================================
--- sbin/mount/mount.c	(revision 190561)
+++ sbin/mount/mount.c	(working copy)
@@ -112,6 +112,7 @@ static struct opt {
 	{ MNT_MULTILABEL,	"multilabel" },
 	{ MNT_ACLS,		"acls" },
 	{ MNT_GJOURNAL,		"gjournal" },
+	{ MNT_ORPHANED,		"orphaned" },
 	{ 0, NULL }
 };
 
Index: sys/ufs/ffs/ffs_vfsops.c
===================================================================
--- sys/ufs/ffs/ffs_vfsops.c	(revision 190561)
+++ sys/ufs/ffs/ffs_vfsops.c	(working copy)
@@ -601,6 +601,15 @@ loop:
  */
 static int sblock_try[] = SBLOCKSEARCH;
 
+static void
+ffs_orphan_callback(struct g_consumer *cp, void *user)
+{
+	struct mount *mp;
+
+	mp = (struct mount *)user;
+	vfs_orphan(mp);
+}
+
 /*
  * Common code for mount and mountroot
  */
@@ -629,9 +638,13 @@ ffs_mountfs(devvp, mp, td)
 
 	dev = devvp->v_rdev;
 	dev_ref(dev);
+	vfs_ref(mp);
 	DROP_GIANT();
 	g_topology_lock();
 	error = g_vfs_open(devvp, &cp, "ffs", ronly ? 0 : 1);
+	if (error == 0)
+		g_vfs_register_callback(cp, ffs_orphan_callback,
+		    mp, G_CB_ORPHAN);
 
 	/*
 	 * If we are a root mount, drop the E flag so fsck can do its magic.
@@ -923,6 +936,7 @@ out:
 		free(ump, M_UFSMNT);
 		mp->mnt_data = NULL;
 	}
+	vfs_rel(mp);
 	dev_rel(dev);
 	return (error);
 }
@@ -1110,6 +1124,7 @@ ffs_unmount(mp, mntflags, td)
 	g_topology_unlock();
 	PICKUP_GIANT();
 	vrele(ump->um_devvp);
+	vfs_rel(mp);
 	dev_rel(ump->um_dev);
 	mtx_destroy(UFS_MTX(ump));
 	if (mp->mnt_gjprovider != NULL) {
Index: sys/kern/vfs_syscalls.c
===================================================================
--- sys/kern/vfs_syscalls.c	(revision 190561)
+++ sys/kern/vfs_syscalls.c	(working copy)
@@ -326,6 +326,8 @@ kern_statfs(struct thread *td, char *path, enum ui
 	sp->f_version = STATFS_VERSION;
 	sp->f_namemax = NAME_MAX;
 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+	if (mp->mnt_kern_flag & MNTK_ORPHANED)
+		sp->f_flags |= MNT_ORPHANED;
 	error = VFS_STATFS(mp, sp, td);
 	if (error)
 		goto out;
@@ -415,6 +417,8 @@ kern_fstatfs(struct thread *td, int fd, struct sta
 	sp->f_version = STATFS_VERSION;
 	sp->f_namemax = NAME_MAX;
 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+	if (mp->mnt_kern_flag & MNTK_ORPHANED)
+		sp->f_flags |= MNT_ORPHANED;
 	error = VFS_STATFS(mp, sp, td);
 	if (error)
 		goto out;
@@ -515,6 +519,8 @@ kern_getfsstat(struct thread *td, struct statfs **
 			sp->f_version = STATFS_VERSION;
 			sp->f_namemax = NAME_MAX;
 			sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+			if (mp->mnt_kern_flag & MNTK_ORPHANED)
+				sp->f_flags |= MNT_ORPHANED;
 			/*
 			 * If MNT_NOWAIT or MNT_LAZY is specified, do not
 			 * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY
@@ -4662,6 +4668,8 @@ kern_fhstatfs(struct thread *td, fhandle_t fh, str
 	sp->f_version = STATFS_VERSION;
 	sp->f_namemax = NAME_MAX;
 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+	if (mp->mnt_kern_flag & MNTK_ORPHANED)
+		sp->f_flags |= MNT_ORPHANED;
 	error = VFS_STATFS(mp, sp, td);
 	if (error == 0)
 		*buf = *sp;
Index: sys/kern/vfs_subr.c
===================================================================
--- sys/kern/vfs_subr.c	(revision 190561)
+++ sys/kern/vfs_subr.c	(working copy)
@@ -1084,7 +1084,7 @@ insmntque1(struct vnode *vp, struct mount *mp,
 #endif
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
-	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
+	    ((mp->mnt_kern_flag & (MNTK_UNMOUNTF | MNTK_ORPHANED)) != 0 ||
 	     mp->mnt_nvnodelistsize == 0)) {
 		locked = VOP_ISLOCKED(vp);
 		if (!locked || (locked == LK_EXCLUSIVE &&
@@ -1092,6 +1092,8 @@ insmntque1(struct vnode *vp, struct mount *mp,
 			MNT_IUNLOCK(mp);
 			if (dtr != NULL)
 				dtr(vp, dtr_arg);
+			if ((mp->mnt_kern_flag & MNTK_ORPHANED) != 0)
+				return (ENXIO);
 			return (EBUSY);
 		}
 	}
@@ -2875,6 +2877,7 @@ DB_SHOW_COMMAND(mount, db_show_mount)
 	MNT_KERN_FLAG(MNTK_MPSAFE);
 	MNT_KERN_FLAG(MNTK_NOKNOTE);
 	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
+	MNT_KERN_FLAG(MNTK_ORPHANED);
 #undef MNT_KERN_FLAG
 	if (flags != 0) {
 		if (buf[0] != '\0')
@@ -4249,6 +4252,45 @@ vfs_read_dirent(struct vop_readdir_args *ap, struc
 }
 
 /*
+ * Mark the filesystem as orphaned.  Usually called when the device
+ * that contained the filesystem goes away.
+ */
+void
+vfs_orphan(struct mount *mp)
+{
+	int error;
+	struct mount *tmp;
+
+	error = vfs_busy(mp, MBF_NOWAIT);
+	/* If the filesystem is being unmounted, do nothing. */
+	if (error)
+		return;
+
+	/* Prevent all future vnode operations from succeeding. */
+	MNT_ILOCK(mp);
+	mp->mnt_kern_flag |= (MNTK_ORPHANED | MNTK_NOINSMNTQ);
+	MNT_IUNLOCK(mp);
+
+	/*
+	 * Don't try to call vflush on a mount structure that is not
+	 * fully initialized yet.  Assume that the mount is initialized
+	 * if it can be found on the mountlist.
+	 */
+	mtx_lock(&mountlist_mtx);
+	TAILQ_FOREACH(tmp, &mountlist, mnt_list) {
+		if (tmp == mp)
+			break;
+	}
+	mtx_unlock(&mountlist_mtx);
+	if (tmp == NULL) {
+		vfs_unbusy(mp);
+		return;
+	}
+	vflush(mp, 0, FORCECLOSE, curthread);
+	vfs_unbusy(mp);
+}
+
+/*
  * Mark for update the access time of the file if the filesystem
  * supports VOP_MARKATIME.  This functionality is used by execve and
  * mmap, so we want to avoid the I/O implied by directly setting
Index: sys/fs/msdosfs/msdosfs_vfsops.c
===================================================================
--- sys/fs/msdosfs/msdosfs_vfsops.c	(revision 190561)
+++ sys/fs/msdosfs/msdosfs_vfsops.c	(working copy)
@@ -403,6 +403,15 @@ msdosfs_mount(struct mount *mp, struct thread *td)
 	return (0);
 }
 
+static void
+msdosfs_orphan_callback(struct g_consumer *cp, void *user)
+{
+	struct mount *mp;
+
+	mp = (struct mount *)user;
+	vfs_orphan(mp);
+}
+
 static int
 mountmsdosfs(struct vnode *devvp, struct mount *mp)
 {
@@ -425,9 +434,13 @@ mountmsdosfs(struct vnode *devvp, struct mount *mp
 
 	dev = devvp->v_rdev;
 	dev_ref(dev);
+ 	vfs_ref(mp);
 	DROP_GIANT();
 	g_topology_lock();
 	error = g_vfs_open(devvp, &cp, "msdosfs", ronly ? 0 : 1);
+ 	if (error == 0)
+		g_vfs_register_callback(cp, msdosfs_orphan_callback,
+		    mp, G_CB_ORPHAN);
 	g_topology_unlock();
 	PICKUP_GIANT();
 	VOP_UNLOCK(devvp, 0);
@@ -766,6 +779,7 @@ error_exit:
 		free(pmp, M_MSDOSFSMNT);
 		mp->mnt_data = NULL;
 	}
+	vfs_rel(mp);
 	dev_rel(dev);
 	return (error);
 }
@@ -831,6 +845,7 @@ msdosfs_unmount(struct mount *mp, int mntflags, st
 	g_topology_unlock();
 	PICKUP_GIANT();
 	vrele(pmp->pm_devvp);
+	vfs_rel(mp);
 	dev_rel(pmp->pm_dev);
 	free(pmp->pm_inusemap, M_MSDOSFSFAT);
 	if (pmp->pm_flags & MSDOSFS_LARGEFS)
Index: sys/geom/geom_vfs.c
===================================================================
--- sys/geom/geom_vfs.c	(revision 190561)
+++ sys/geom/geom_vfs.c	(working copy)
@@ -34,6 +34,8 @@ __FBSDID("$FreeBSD$");
 #include <sys/malloc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>	/* XXX Temporary for VFS_LOCK_GIANT */
+#include <sys/queue.h>
+#include <sys/taskqueue.h>
 
 #include <geom/geom.h>
 #include <geom/geom_vfs.h>
@@ -130,17 +132,78 @@ g_vfs_strategy(struct bufobj *bo, struct buf *bp)
 	g_io_request(bip, cp);
 }
 
+struct g_vfs_cb {
+	struct g_consumer	*cb_consumer;
+	int			cb_event;
+	void			(*cb_callback)(struct g_consumer *, void *);
+	void			*cb_userptr;
+	struct task		cb_task;
+};
+
+/*
+ * When registering the callback from the mount routine, the topology lock
+ * is being taken while holding devvp vnode lock.  The callback routine
+ * would probably try to grab devvp vnode lock, and executing it from
+ * g_event context, while holding topology lock, would cause LOR.  To make
+ * sure this doesn't happen, we call the callback from taskqueue.
+ */
 static void
+g_vfs_cb_func(void *context, int pending)
+{
+	struct g_vfs_cb *cb;
+
+	cb = context;
+
+	KASSERT(cb->cb_event == G_CB_ORPHAN,
+	    ("found callback for unknown event"));
+	
+	(cb->cb_callback)(cb->cb_consumer, cb->cb_userptr);
+}
+
+void
+g_vfs_register_callback(struct g_consumer *cp,
+    void (callback)(struct g_consumer *, void *), void *userptr, int event)
+{
+	struct g_vfs_cb *cb;
+
+	g_topology_assert();
+
+	KASSERT(event >= 0 && event <= G_CB_LAST,
+	    ("invalid callback event flag"));
+	cb = cp->private;
+	KASSERT(cb[event].cb_callback == NULL,
+	    ("callback already registered"));
+
+	cb[event].cb_callback = callback;
+	cb[event].cb_userptr = userptr;
+	cb[event].cb_consumer = cp;
+	cb[event].cb_event = event;
+	TASK_INIT(&(cb[event].cb_task), 0, g_vfs_cb_func, &(cb[event]));
+}
+
+static void
 g_vfs_orphan(struct g_consumer *cp)
 {
 	struct g_geom *gp;
 	struct bufobj *bo;
+	struct g_vfs_cb *cb;
+	int error;
 
 	g_topology_assert();
 
 	gp = cp->geom;
 	bo = gp->softc;
+	cb = cp->private;
+
 	g_trace(G_T_TOPOLOGY, "g_vfs_orphan(%p(%s))", cp, gp->name);
+
+	if (cb != NULL && cb[G_CB_ORPHAN].cb_callback != NULL) {
+		error = taskqueue_enqueue(taskqueue_thread,
+		    &(cb[G_CB_ORPHAN].cb_task));
+		KASSERT(error == 0, ("taskqueue_enqueue(9) failed."));
+		taskqueue_drain(taskqueue_thread, &(cb[G_CB_ORPHAN].cb_task));
+	}
+
 	if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0)
 		g_access(cp, -cp->acr, -cp->acw, -cp->ace);
 	g_detach(cp);
@@ -169,6 +232,8 @@ g_vfs_open(struct vnode *vp, struct g_consumer **c
 	gp = g_new_geomf(&g_vfs_class, "%s.%s", fsname, pp->name);
 	cp = g_new_consumer(gp);
 	g_attach(cp, pp);
+	cp->private = g_malloc(sizeof(struct g_vfs_cb[G_CB_LAST + 1]),
+	    M_WAITOK | M_ZERO);
 	error = g_access(cp, 1, wr, 1);
 	if (error) {
 		g_wither_geom(gp, ENXIO);
@@ -195,6 +260,8 @@ g_vfs_close(struct g_consumer *cp)
 
 	g_topology_assert();
 
+	g_free(cp->private);
+	cp->private = NULL;
 	gp = cp->geom;
 	bo = gp->softc;
 	bufobj_invalbuf(bo, V_SAVE, 0, 0);
Index: sys/geom/geom_vfs.h
===================================================================
--- sys/geom/geom_vfs.h	(revision 190561)
+++ sys/geom/geom_vfs.h	(working copy)
@@ -35,8 +35,13 @@ struct buf;
 
 extern struct buf_ops *g_vfs_bufops;
 
+#define	G_CB_ORPHAN	1
+#define	G_CB_LAST	G_CB_ORPHAN
+
 void g_vfs_strategy(struct bufobj *bo, struct buf *bp);
 int g_vfs_open(struct vnode *vp, struct g_consumer **cpp, const char *fsname, int wr);
 void g_vfs_close(struct g_consumer *cp);
+void g_vfs_register_callback(struct g_consumer *cp,
+    void (callback)(struct g_consumer *, void *), void *user, int event);
 
 #endif /* _GEOM_GEOM_VFS_H_ */
Index: sys/sys/mount.h
===================================================================
--- sys/sys/mount.h	(revision 190561)
+++ sys/sys/mount.h	(working copy)
@@ -250,14 +250,17 @@ void          __mnt_vnode_markerfree(struct vnode
 #define	MNT_EXPUBLIC	0x20000000	/* public export (WebNFS) */
 
 /*
- * Flags set by internal operations,
- * but visible to the user.
- * XXX some of these are not quite right.. (I've never seen the root flag set)
+ * Flags set by internal operations, but visible to the user.
+ * Note that MNT_ORPHANED flag is never actually set on mnt_flag field
+ * in struct mount; it's only set on f_flags in struct statfs when
+ * MNTK_ORPHANED is set.  We cannot use MNT_ORPHANED instead of MNTK_ORPHANED
+ * due to missing locking of mnt_flag.
  */
 #define	MNT_LOCAL	0x00001000	/* filesystem is stored locally */
 #define	MNT_QUOTA	0x00002000	/* quotas are enabled on filesystem */
 #define	MNT_ROOTFS	0x00004000	/* identifies the root filesystem */
 #define	MNT_USER	0x00008000	/* mounted by a user */
+#define	MNT_ORPHANED	0x00020000	/* MNTK_ORPHANED is set */
 #define	MNT_IGNORE	0x00800000	/* do not show entry in df */
 
 /*
@@ -273,7 +276,8 @@ void          __mnt_vnode_markerfree(struct vnode
 			MNT_ROOTFS	| MNT_NOATIME	| MNT_NOCLUSTERR| \
 			MNT_NOCLUSTERW	| MNT_SUIDDIR	| MNT_SOFTDEP	| \
 			MNT_IGNORE	| MNT_EXPUBLIC	| MNT_NOSYMFOLLOW | \
-			MNT_GJOURNAL	| MNT_MULTILABEL | MNT_ACLS)
+			MNT_GJOURNAL	| MNT_MULTILABEL | MNT_ACLS | \
+			MNT_ORPHANED)
 
 /* Mask of flags that can be updated. */
 #define	MNT_UPDATEMASK (MNT_NOSUID	| MNT_NOEXEC	| \
@@ -289,6 +293,8 @@ void          __mnt_vnode_markerfree(struct vnode
  * XXX: These are not STATES and really should be somewhere else.
  * XXX: MNT_BYFSID collides with MNT_ACLS, but because MNT_ACLS is only used for
  *      mount(2) and MNT_BYFSID is only used for unmount(2) it's harmless.
+ * XXX: MNT_DELEXPORT collides with MNT_ORPHANED, but MNT_DELEXPORT is never
+ *      used in mnt_flag, only for ex_flags.
  */
 #define	MNT_UPDATE	0x00010000	/* not a real mount, just an update */
 #define	MNT_DELEXPORT	0x00020000	/* delete export host lists */
@@ -325,6 +331,7 @@ void          __mnt_vnode_markerfree(struct vnode
 #define	MNTK_DRAINING	0x00000010	/* lock draining is happening */
 #define	MNTK_REFEXPIRE	0x00000020	/* refcount expiring is happening */
 #define MNTK_EXTENDED_SHARED	0x00000040 /* Allow shared locking for more ops */
+#define	MNTK_ORPHANED	0x00000080	/* device is gone */
 #define MNTK_UNMOUNT	0x01000000	/* unmount in progress */
 #define	MNTK_MWAIT	0x02000000	/* waiting for unmount to finish */
 #define	MNTK_SUSPEND	0x08000000	/* request write suspension */
@@ -747,6 +754,7 @@ struct mount *vfs_mount_alloc(struct vnode *, stru
 int	vfs_suser(struct mount *, struct thread *);
 void	vfs_unbusy(struct mount *);
 void	vfs_unmountall(void);
+void	vfs_orphan(struct mount *);
 extern	TAILQ_HEAD(mntlist, mount) mountlist;	/* mounted filesystem list */
 extern	struct mtx mountlist_mtx;
 extern	struct nfs_public nfs_pub;

-- 
If you cut off my head, what would I say?  Me and my head, or me and my body?