Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 09 Jan 2012 18:37:52 +0200
From:      Mikolaj Golub <trociny@freebsd.org>
To:        arch@freebsd.org
Cc:        Robert Watson <rwatson@FreeBSD.org>, Kostik Belousov <kib@FreeBSD.org>
Subject:   unix domain sockets on nullfs(5)
Message-ID:  <86sjjobzmn.fsf@kopusha.home.net>

next in thread | raw e-mail | index | archive | help
--=-=-=

Hi,

There is a longstanding problem with nullfs(5) that is unix sockets do
not work between lower and upper layers.

See, e.g. kern/51583, kern/159663.

On a unix socket binding the created socket is referenced in the vnode
field v_socket. This field is used on connect (from the vnode returned
by lookup). Unix socket functions like unp_bind/connect set/access
this field directly.

This is the issue for nullfs, which uses two-layer vnode approach:
binding to the upper layer, the socket reference is stored in the
upper vnode; binding to the lower fs, the socket reference is stored
in the lower vnode and is not seen from the upper layer.

E.g. having /mnt/upper nullfs mounted on /mnt/lower:

1) if we bind to /mnt/lower/test.sock we can connect only to
/mnt/lower/test.sock.

2) if we bind to /mnt/upper/test.sock we can connect only to
/mnt/upper/test.sock.

The desired behavior is one can connect to both the lower and the
upper paths regardless if we bind to /mnt/lower/test.sock or
/mnt/upeer/test.sock.

In kern/159663 two approaches were discussed:

1) copy the socket pointer from lower vnode to upper vnode on the
upper vnode get  (fix the case when one binds to the lower fs and wants
to connect via the upper, but does not fix the case when one binds to
the upper and wants to connect via the lower fs);

2) make null_lookup/create return lower vnode for VSOCK vnodes.

Both approaches have issues and looks rather hackish.

kib@ suggested that the issue could be fixed if one added new VOP_*
operations for setting and accessing vnode's v_socket field.

The attached patch implements this. It also can be found here:

http://people.freebsd.org/~trociny/nullfs.VOP_UNP.4.patch

It adds three VOP_* operations: VOP_UNPBIND, VOP_UNPCONNECT and
VOP_UNPDETACH. Their purpose can be understood from the modifications
in uipc_usrreq.c:

-	vp->v_socket = unp->unp_socket;
+	VOP_UNPBIND(vp, unp->unp_socket);

-	so2 = vp->v_socket;
+	VOP_UNPCONNECT(vp, &so2);

-	unp->unp_vnode->v_socket = NULL;
+	VOP_UNPDETACH(unp->unp_vnode);

The default functions just do these simple operations, while
filesystems like nullfs can do more complicated things.

The patch also implements functions for nullfs. By default the old
behavior is preserved. To get the new behaviour the filesystem should
be (re)mounted with sobypass option. Then the socket operations are
bypassed to a lower vnode, which makes the socket be accessible from
both layers.

I am very interested to hear other people opinion on this.

-- 
Mikolaj Golub


--=-=-=
Content-Type: text/x-patch
Content-Disposition: inline; filename=nullfs.VOP_UNP.4.patch

Index: sys/sys/vnode.h
===================================================================
--- sys/sys/vnode.h	(revision 229701)
+++ sys/sys/vnode.h	(working copy)
@@ -695,6 +695,9 @@ int	vop_stdpathconf(struct vop_pathconf_args *);
 int	vop_stdpoll(struct vop_poll_args *);
 int	vop_stdvptocnp(struct vop_vptocnp_args *ap);
 int	vop_stdvptofh(struct vop_vptofh_args *ap);
+int	vop_stdunpbind(struct vop_unpbind_args *ap);
+int	vop_stdunpconnect(struct vop_unpconnect_args *ap);
+int	vop_stdunpdetach(struct vop_unpdetach_args *ap);
 int	vop_eopnotsupp(struct vop_generic_args *ap);
 int	vop_ebadf(struct vop_generic_args *ap);
 int	vop_einval(struct vop_generic_args *ap);
Index: sys/kern/uipc_usrreq.c
===================================================================
--- sys/kern/uipc_usrreq.c	(revision 229701)
+++ sys/kern/uipc_usrreq.c	(working copy)
@@ -542,7 +542,7 @@ restart:
 
 	UNP_LINK_WLOCK();
 	UNP_PCB_LOCK(unp);
-	vp->v_socket = unp->unp_socket;
+	VOP_UNPBIND(vp, unp->unp_socket);
 	unp->unp_vnode = vp;
 	unp->unp_addr = soun;
 	unp->unp_flags &= ~UNP_BINDING;
@@ -638,7 +638,7 @@ uipc_detach(struct socket *so)
 	 * XXXRW: Should assert vp->v_socket == so.
 	 */
 	if ((vp = unp->unp_vnode) != NULL) {
-		unp->unp_vnode->v_socket = NULL;
+		VOP_UNPDETACH(vp);
 		unp->unp_vnode = NULL;
 	}
 	unp2 = unp->unp_conn;
@@ -1308,7 +1308,7 @@ unp_connect(struct socket *so, struct sockaddr *na
 	 * and to protect simultaneous locking of multiple pcbs.
 	 */
 	UNP_LINK_WLOCK();
-	so2 = vp->v_socket;
+	VOP_UNPCONNECT(vp, &so2);
 	if (so2 == NULL) {
 		error = ECONNREFUSED;
 		goto bad2;
Index: sys/kern/vfs_default.c
===================================================================
--- sys/kern/vfs_default.c	(revision 229701)
+++ sys/kern/vfs_default.c	(working copy)
@@ -123,6 +123,9 @@ struct vop_vector default_vnodeops = {
 	.vop_unlock =		vop_stdunlock,
 	.vop_vptocnp =		vop_stdvptocnp,
 	.vop_vptofh =		vop_stdvptofh,
+	.vop_unpbind =		vop_stdunpbind,
+	.vop_unpconnect =	vop_stdunpconnect,
+	.vop_unpdetach =	vop_stdunpdetach,
 };
 
 /*
@@ -1037,6 +1040,39 @@ vop_stdadvise(struct vop_advise_args *ap)
 	return (error);
 }
 
+int
+vop_stdunpbind(struct vop_unpbind_args *ap)
+{
+	struct vnode *vp;
+
+	vp = ap->a_vp;
+
+	vp->v_socket = ap->a_socket;
+	return (0);
+}
+
+int
+vop_stdunpconnect(struct vop_unpconnect_args *ap)
+{
+	struct vnode *vp;
+
+	vp = ap->a_vp;
+
+	*ap->a_socket = vp->v_socket;
+	return (0);
+}
+
+int
+vop_stdunpdetach(struct vop_unpdetach_args *ap)
+{
+	struct vnode *vp;
+
+	vp = ap->a_vp;
+
+	vp->v_socket = NULL;
+	return (0);
+}
+
 /*
  * vfs default ops
  * used to fill the vfs function table to get reasonable default return values.
Index: sys/kern/vnode_if.src
===================================================================
--- sys/kern/vnode_if.src	(revision 229701)
+++ sys/kern/vnode_if.src	(working copy)
@@ -639,3 +639,23 @@ vop_advise {
 	IN off_t end;
 	IN int advice;
 };
+
+%% unpbind	vp	E E E
+
+vop_unpbind {
+	IN struct vnode *vp;
+	IN struct socket *socket;
+};
+
+%% unpconnect	vp	E E E
+
+vop_unpconnect {
+	IN struct vnode *vp;
+	OUT struct socket **socket;
+};
+
+%% unpdetach	vp	E E E
+
+vop_unpdetach {
+	IN struct vnode *vp;
+};
Index: sys/fs/nullfs/null.h
===================================================================
--- sys/fs/nullfs/null.h	(revision 229701)
+++ sys/fs/nullfs/null.h	(working copy)
@@ -37,8 +37,15 @@
 struct null_mount {
 	struct mount	*nullm_vfs;
 	struct vnode	*nullm_rootvp;	/* Reference to root null_node */
+	uint64_t	nullm_flags;	/* nullfs options specific for mount */
 };
 
+/*
+ * Flags stored in nullm_flags.
+ */
+#define	NULLMNT_SOBYPASS	0x00000001	/* Bypass unix socket operations
+						   to lower vnode */
+
 #ifdef _KERNEL
 /*
  * A cache of vnode references
@@ -47,8 +54,16 @@ struct null_node {
 	LIST_ENTRY(null_node)	null_hash;	/* Hash list */
 	struct vnode	        *null_lowervp;	/* VREFed once */
 	struct vnode		*null_vnode;	/* Back pointer */
+	u_int			null_flags;	/* Flags */
 };
 
+/*
+ * Flags stored in null_flags.
+ */
+
+#define	NULL_SOBYPASS	0x00000001	/* Bypass unix socket operations
+					   to lower vnode */
+
 #define	MOUNTTONULLMOUNT(mp) ((struct null_mount *)((mp)->mnt_data))
 #define	VTONULL(vp) ((struct null_node *)(vp)->v_data)
 #define	NULLTOV(xp) ((xp)->null_vnode)
Index: sys/fs/nullfs/null_vnops.c
===================================================================
--- sys/fs/nullfs/null_vnops.c	(revision 229701)
+++ sys/fs/nullfs/null_vnops.c	(working copy)
@@ -812,6 +812,52 @@ null_vptocnp(struct vop_vptocnp_args *ap)
 	return (error);
 }
 
+static int
+null_unpbind(struct vop_unpbind_args *ap)
+{
+	struct vnode *vp;
+	struct null_node *xp;
+	struct null_mount *xmp;
+
+	vp = ap->a_vp;
+	xp = VTONULL(vp);
+	xmp = MOUNTTONULLMOUNT(vp->v_mount);
+	if (xmp->nullm_flags & NULLMNT_SOBYPASS) {
+		xp->null_flags |= NULL_SOBYPASS;
+		return (null_bypass((struct vop_generic_args *)ap));
+	} else {
+		return (vop_stdunpbind(ap));
+	}
+}
+
+static int
+null_unpconnect(struct vop_unpconnect_args *ap)
+{
+	struct vnode *vp;
+	struct null_mount *xmp;
+
+	vp = ap->a_vp;
+	xmp = MOUNTTONULLMOUNT(vp->v_mount);
+	if (xmp->nullm_flags & NULLMNT_SOBYPASS)
+		return (null_bypass((struct vop_generic_args *)ap));
+	else
+		return (vop_stdunpconnect(ap));
+}
+
+static int
+null_unpdetach(struct vop_unpdetach_args *ap)
+{
+	struct vnode *vp;
+	struct null_node *xp;
+
+	vp = ap->a_vp;
+	xp = VTONULL(vp);
+	if (xp->null_flags & NULL_SOBYPASS)
+		return (null_bypass((struct vop_generic_args *)ap));
+	else
+		return (vop_stdunpdetach(ap));
+}
+
 /*
  * Global vfs data structures
  */
@@ -837,4 +883,7 @@ struct vop_vector null_vnodeops = {
 	.vop_unlock =		null_unlock,
 	.vop_vptocnp =		null_vptocnp,
 	.vop_vptofh =		null_vptofh,
+	.vop_unpbind =		null_unpbind,
+	.vop_unpconnect =	null_unpconnect,
+	.vop_unpdetach =	null_unpdetach,
 };
Index: sys/fs/nullfs/null_subr.c
===================================================================
--- sys/fs/nullfs/null_subr.c	(revision 229701)
+++ sys/fs/nullfs/null_subr.c	(working copy)
@@ -235,6 +235,7 @@ null_nodeget(mp, lowervp, vpp)
 
 	xp->null_vnode = vp;
 	xp->null_lowervp = lowervp;
+	xp->null_flags = 0;
 	vp->v_type = lowervp->v_type;
 	vp->v_data = xp;
 	vp->v_vnlock = lowervp->v_vnlock;
Index: sys/fs/nullfs/null_vfsops.c
===================================================================
--- sys/fs/nullfs/null_vfsops.c	(revision 229701)
+++ sys/fs/nullfs/null_vfsops.c	(working copy)
@@ -84,16 +84,26 @@ nullfs_mount(struct mount *mp)
 	if (mp->mnt_flag & MNT_ROOTFS)
 		return (EOPNOTSUPP);
 	/*
-	 * Update is a no-op
+	 * Update is supported only for some options.
 	 */
 	if (mp->mnt_flag & MNT_UPDATE) {
-		/*
-		 * Only support update mounts for NFS export.
-		 */
+		error = EOPNOTSUPP;
+		xmp = MOUNTTONULLMOUNT(mp);
+		if (vfs_flagopt(mp->mnt_optnew, "sobypass", NULL, 0)) {
+			MNT_ILOCK(mp);
+			xmp->nullm_flags |= NULLMNT_SOBYPASS;
+			MNT_IUNLOCK(mp);
+			error = 0;
+		}
+		if (vfs_flagopt(mp->mnt_optnew, "nosobypass", NULL, 0)) {
+			MNT_ILOCK(mp);
+			xmp->nullm_flags &= ~NULLMNT_SOBYPASS;
+			MNT_IUNLOCK(mp);
+			error = 0;
+		}
 		if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0))
-			return (0);
-		else
-			return (EOPNOTSUPP);
+			error = 0;
+		return (error);
 	}
 
 	/*
@@ -182,6 +192,11 @@ nullfs_mount(struct mount *mp)
 	MNT_ILOCK(mp);
 	mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag & MNTK_MPSAFE;
 	MNT_IUNLOCK(mp);
+
+	xmp->nullm_flags = 0;
+	vfs_flagopt(mp->mnt_optnew, "sobypass", &xmp->nullm_flags,
+	    NULLMNT_SOBYPASS);
+
 	mp->mnt_data =  xmp;
 	vfs_getnewfsid(mp);
 
Index: sbin/mount_nullfs/mount_nullfs.c
===================================================================
--- sbin/mount_nullfs/mount_nullfs.c	(revision 229701)
+++ sbin/mount_nullfs/mount_nullfs.c	(working copy)
@@ -57,27 +57,36 @@ static const char rcsid[] =
 
 #include "mntopts.h"
 
+#define NULLOPT_SOBYPASS	0x00000001
+#define NULLOPT_MASK		(NULLOPT_SOBYPASS)
+
 static struct mntopt mopts[] = {
 	MOPT_STDOPTS,
+	MOPT_UPDATE,
+	{"sobypass", 0, NULLOPT_SOBYPASS, 1},
 	MOPT_END
 };
 
+static char fstype[] = "nullfs";
+
 int	subdir(const char *, const char *);
 static void	usage(void) __dead2;
 
 int
 main(int argc, char *argv[])
 {
-	struct iovec iov[6];
-	int ch, mntflags;
+	struct iovec *iov;
+	int ch, iovlen, mntflags, nullflags, negflags;
 	char source[MAXPATHLEN];
 	char target[MAXPATHLEN];
 
-	mntflags = 0;
+	mntflags = nullflags = 0;
+	negflags = NULLOPT_MASK;
 	while ((ch = getopt(argc, argv, "o:")) != -1)
 		switch(ch) {
 		case 'o':
-			getmntopts(optarg, mopts, &mntflags, 0);
+			getmntopts(optarg, mopts, &mntflags, &nullflags);
+			getmntopts(optarg, mopts, &mntflags, &negflags);
 			break;
 		case '?':
 		default:
@@ -97,20 +106,18 @@ main(int argc, char *argv[])
 		errx(EX_USAGE, "%s (%s) and %s are not distinct paths",
 		    argv[0], target, argv[1]);
 
-	iov[0].iov_base = strdup("fstype");
-	iov[0].iov_len = sizeof("fstype");
-	iov[1].iov_base = strdup("nullfs");
-	iov[1].iov_len = strlen(iov[1].iov_base) + 1;
-	iov[2].iov_base = strdup("fspath");
-	iov[2].iov_len = sizeof("fspath");
-	iov[3].iov_base = source;
-	iov[3].iov_len = strlen(source) + 1;
-	iov[4].iov_base = strdup("target");
-	iov[4].iov_len = sizeof("target");
-	iov[5].iov_base = target;
-	iov[5].iov_len = strlen(target) + 1;
-
-	if (nmount(iov, 6, mntflags))
+	iov = NULL;
+	iovlen = 0;
+	build_iovec(&iov, &iovlen, "fstype", fstype, (size_t)-1);
+	build_iovec(&iov, &iovlen, "fspath", source, (size_t)-1);
+	build_iovec(&iov, &iovlen, "target", target, (size_t)-1);
+	if ((nullflags & NULLOPT_SOBYPASS) != 0)
+		build_iovec(&iov, &iovlen, "sobypass", NULL, 0);
+	if ((mntflags & MNT_UPDATE) != 0) {
+		if ((negflags & NULLOPT_SOBYPASS) == 0)
+			build_iovec(&iov, &iovlen, "nosobypass", NULL, 0);
+	}
+	if (nmount(iov, iovlen, mntflags))
 		err(1, NULL);
 	exit(0);
 }
Index: sbin/mount_nullfs/mount_nullfs.8
===================================================================
--- sbin/mount_nullfs/mount_nullfs.8	(revision 229701)
+++ sbin/mount_nullfs/mount_nullfs.8	(working copy)
@@ -79,8 +79,14 @@ Options are specified with a
 flag followed by a comma separated string of options.
 See the
 .Xr mount 8
-man page for possible options and their meanings.
+man page for standard options and their meanings.
+Options specific for
+.Nm :
+.Bl -tag -width sobypass
+.It Cm sobypass
+Bypass unix socket operations to the lower layer.
 .El
+.El
 .Pp
 The null layer has two purposes.
 First, it serves as a demonstration of layering by providing a layer

--=-=-=--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?86sjjobzmn.fsf>