Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 2 Nov 2016 12:43:16 +0000 (UTC)
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r308212 - in head/sys: fs/nfsserver kern sys
Message-ID:  <201611021243.uA2ChGIg003295@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: kib
Date: Wed Nov  2 12:43:15 2016
New Revision: 308212
URL: https://svnweb.freebsd.org/changeset/base/308212

Log:
  Allow some dotdot lookups in capability mode.
  
  If dotdot lookup does not escape from the file descriptor passed as
  the lookup root, we can allow the component traversal.  Track the
  directories traversed, and check the result of dotdot lookup against
  the recorded list of the directory vnodes.
  
  Dotdot lookups are enabled by sysctl vfs.lookup_cap_dotdot, currently
  disabled by default until more verification of the approach is done.
  
  Disallow non-local filesystems for dotdot, since remote server might
  conspire with the local process to allow it to escape the namespace.
  This might be too cautious, provide the knob
  vfs.lookup_cap_dotdot_nonlocal to override as well.
  
  Idea by:	rwatson
  Discussed with:	emaste, jonathan, rwatson
  Reviewed by:	mjg (previous version)
  Tested by:	pho (previous version)
  Sponsored by:	The FreeBSD Foundation
  MFC after:	2 week
  Differential revision:	https://reviews.freebsd.org/D8110

Modified:
  head/sys/fs/nfsserver/nfs_nfsdport.c
  head/sys/kern/vfs_lookup.c
  head/sys/kern/vfs_syscalls.c
  head/sys/sys/namei.h

Modified: head/sys/fs/nfsserver/nfs_nfsdport.c
==============================================================================
--- head/sys/fs/nfsserver/nfs_nfsdport.c	Wed Nov  2 12:10:39 2016	(r308211)
+++ head/sys/fs/nfsserver/nfs_nfsdport.c	Wed Nov  2 12:43:15 2016	(r308212)
@@ -350,7 +350,7 @@ nfsvno_namei(struct nfsrv_descript *nd, 
 
 	*retdirp = NULL;
 	cnp->cn_nameptr = cnp->cn_pnbuf;
-	ndp->ni_strictrelative = 0;
+	ndp->ni_lcf = 0;
 	/*
 	 * Extract and set starting directory.
 	 */

Modified: head/sys/kern/vfs_lookup.c
==============================================================================
--- head/sys/kern/vfs_lookup.c	Wed Nov  2 12:10:39 2016	(r308211)
+++ head/sys/kern/vfs_lookup.c	Wed Nov  2 12:43:15 2016	(r308212)
@@ -79,12 +79,22 @@ uma_zone_t namei_zone;
 /* Placeholder vnode for mp traversal. */
 static struct vnode *vp_crossmp;
 
+struct nameicap_tracker {
+	struct vnode *dp;
+	TAILQ_ENTRY(nameicap_tracker) nm_link;
+};
+
+/* Zone for cap mode tracker elements used for dotdot capability checks. */
+static uma_zone_t nt_zone;
+
 static void
 nameiinit(void *dummy __unused)
 {
 
 	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
+	nt_zone = uma_zcreate("rentr", sizeof(struct nameicap_tracker),
+	    NULL, NULL, NULL, NULL, sizeof(void *), 0);
 	getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp);
 	vn_lock(vp_crossmp, LK_EXCLUSIVE);
 	VN_LOCK_ASHARE(vp_crossmp);
@@ -96,6 +106,76 @@ static int lookup_shared = 1;
 SYSCTL_INT(_vfs, OID_AUTO, lookup_shared, CTLFLAG_RWTUN, &lookup_shared, 0,
     "enables shared locks for path name translation");
 
+/*
+ * Intent is that lookup_cap_dotdot becomes unconditionally enabled,
+ * but it defaults to the disabled state until verification efforts
+ * are complete.
+ */
+static int lookup_cap_dotdot = 0;
+SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot, CTLFLAG_RWTUN,
+    &lookup_cap_dotdot, 0,
+    "enables \"..\" components in path lookup in capability mode");
+static int lookup_cap_dotdot_nonlocal = 0;
+SYSCTL_INT(_vfs, OID_AUTO, lookup_cap_dotdot_nonlocal, CTLFLAG_RWTUN,
+    &lookup_cap_dotdot_nonlocal, 0,
+    "enables \"..\" components in path lookup in capability mode "
+    "on non-local mount");
+
+static void
+nameicap_tracker_add(struct nameidata *ndp, struct vnode *dp)
+{
+	struct nameicap_tracker *nt;
+
+	if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp->v_type != VDIR)
+		return;
+	nt = uma_zalloc(nt_zone, M_WAITOK);
+	vhold(dp);
+	nt->dp = dp;
+	TAILQ_INSERT_TAIL(&ndp->ni_cap_tracker, nt, nm_link);
+}
+
+static void
+nameicap_cleanup(struct nameidata *ndp)
+{
+	struct nameicap_tracker *nt, *nt1;
+
+	KASSERT(TAILQ_EMPTY(&ndp->ni_cap_tracker) ||
+	    (ndp->ni_lcf & NI_LCF_CAP_DOTDOT) != 0, ("not strictrelative"));
+	TAILQ_FOREACH_SAFE(nt, &ndp->ni_cap_tracker, nm_link, nt1) {
+		TAILQ_REMOVE(&ndp->ni_cap_tracker, nt, nm_link);
+		vdrop(nt->dp);
+		uma_zfree(nt_zone, nt);
+	}
+}
+
+/*
+ * For dotdot lookups in capability mode, only allow the component
+ * lookup to succeed if the resulting directory was already traversed
+ * during the operation.  Also fail dotdot lookups for non-local
+ * filesystems, where external agents might assist local lookups to
+ * escape the compartment.
+ */
+static int
+nameicap_check_dotdot(struct nameidata *ndp, struct vnode *dp)
+{
+	struct nameicap_tracker *nt;
+	struct mount *mp;
+
+	if ((ndp->ni_lcf & NI_LCF_CAP_DOTDOT) == 0 || dp == NULL ||
+	    dp->v_type != VDIR)
+		return (0);
+	mp = dp->v_mount;
+	if (lookup_cap_dotdot_nonlocal == 0 && mp != NULL &&
+	    (mp->mnt_flag & MNT_LOCAL) == 0)
+		return (ENOTCAPABLE);
+	TAILQ_FOREACH_REVERSE(nt, &ndp->ni_cap_tracker, nameicap_tracker_head,
+	    nm_link) {
+		if (dp == nt->dp)
+			return (0);
+	}
+	return (ENOTCAPABLE);
+}
+
 static void
 namei_cleanup_cnp(struct componentname *cnp)
 {
@@ -113,7 +193,7 @@ namei_handle_root(struct nameidata *ndp,
 	struct componentname *cnp;
 
 	cnp = &ndp->ni_cnd;
-	if (ndp->ni_strictrelative != 0) {
+	if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0) {
 #ifdef KTRACE
 		if (KTRPOINT(curthread, KTR_CAPFAIL))
 			ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
@@ -177,6 +257,8 @@ namei(struct nameidata *ndp)
 	if (!lookup_shared)
 		cnp->cn_flags &= ~LOCKSHARED;
 	fdp = p->p_fd;
+	TAILQ_INIT(&ndp->ni_cap_tracker);
+	ndp->ni_lcf = 0;
 
 	/* We will set this ourselves if we need it. */
 	cnp->cn_flags &= ~TRAILINGSLASH;
@@ -202,13 +284,21 @@ namei(struct nameidata *ndp)
 
 #ifdef CAPABILITY_MODE
 	/*
-	 * In capability mode, lookups must be "strictly relative" (i.e.
-	 * not an absolute path, and not containing '..' components) to
-	 * a real file descriptor, not the pseudo-descriptor AT_FDCWD.
+	 * In capability mode, lookups must be restricted to happen in
+	 * the subtree with the root specified by the file descriptor:
+	 * - The root must be real file descriptor, not the pseudo-descriptor
+	 *   AT_FDCWD.
+	 * - The passed path must be relative and not absolute.
+	 * - If lookup_cap_dotdot is disabled, path must not contain the
+	 *   '..' components.
+	 * - If lookup_cap_dotdot is enabled, we verify that all '..'
+	 *   components lookups result in the directories which were
+	 *   previously walked by us, which prevents an escape from
+	 *   the relative root.
 	 */
 	if (error == 0 && IN_CAPABILITY_MODE(td) &&
 	    (cnp->cn_flags & NOCAPCHECK) == 0) {
-		ndp->ni_strictrelative = 1;
+		ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
 		if (ndp->ni_dirfd == AT_FDCWD) {
 #ifdef KTRACE
 			if (KTRPOINT(td, KTR_CAPFAIL))
@@ -282,7 +372,7 @@ namei(struct nameidata *ndp)
 			    &rights) ||
 			    ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
 			    ndp->ni_filecaps.fc_nioctls != -1) {
-				ndp->ni_strictrelative = 1;
+				ndp->ni_lcf |= NI_LCF_STRICTRELATIVE;
 			}
 #endif
 		}
@@ -297,6 +387,9 @@ namei(struct nameidata *ndp)
 			vrele(dp);
 		goto out;
 	}
+	if ((ndp->ni_lcf & NI_LCF_STRICTRELATIVE) != 0 &&
+	    lookup_cap_dotdot != 0)
+		ndp->ni_lcf |= NI_LCF_CAP_DOTDOT;
 	SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
 	    cnp->cn_flags);
 	for (;;) {
@@ -313,7 +406,7 @@ namei(struct nameidata *ndp)
 				namei_cleanup_cnp(cnp);
 			} else
 				cnp->cn_flags |= HASBUF;
-
+			nameicap_cleanup(ndp);
 			SDT_PROBE2(vfs, namei, lookup, return, 0, ndp->ni_vp);
 			return (0);
 		}
@@ -387,6 +480,7 @@ namei(struct nameidata *ndp)
 out:
 	vrele(ndp->ni_rootdir);
 	namei_cleanup_cnp(cnp);
+	nameicap_cleanup(ndp);
 	SDT_PROBE2(vfs, namei, lookup, return, error, NULL);
 	return (error);
 }
@@ -583,6 +677,8 @@ dirloop:
 		goto bad;
 	}
 
+	nameicap_tracker_add(ndp, dp);
+
 	/*
 	 * Check for degenerate name (e.g. / or "")
 	 * which is a way of talking about a directory,
@@ -618,9 +714,8 @@ dirloop:
 
 	/*
 	 * Handle "..": five special cases.
-	 * 0. If doing a capability lookup, return ENOTCAPABLE (this is a
-	 *    fairly conservative design choice, but it's the only one that we
-	 *    are satisfied guarantees the property we're looking for).
+	 * 0. If doing a capability lookup and lookup_cap_dotdot is
+	 *    disabled, return ENOTCAPABLE.
 	 * 1. Return an error if this is the last component of
 	 *    the name and the operation is DELETE or RENAME.
 	 * 2. If at root directory (e.g. after chroot)
@@ -632,9 +727,15 @@ dirloop:
 	 *    .. in the other filesystem.
 	 * 4. If the vnode is the top directory of
 	 *    the jail or chroot, don't let them out.
+	 * 5. If doing a capability lookup and lookup_cap_dotdot is
+	 *    enabled, return ENOTCAPABLE if the lookup would escape
+	 *    from the initial file descriptor directory.  Checks are
+	 *    done by ensuring that namei() already traversed the
+	 *    result of dotdot lookup.
 	 */
 	if (cnp->cn_flags & ISDOTDOT) {
-		if (ndp->ni_strictrelative != 0) {
+		if ((ndp->ni_lcf & (NI_LCF_STRICTRELATIVE | NI_LCF_CAP_DOTDOT))
+		    == NI_LCF_STRICTRELATIVE) {
 #ifdef KTRACE
 			if (KTRPOINT(curthread, KTR_CAPFAIL))
 				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
@@ -676,6 +777,14 @@ dirloop:
 			vn_lock(dp,
 			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
 			    LK_RETRY, ISDOTDOT));
+			error = nameicap_check_dotdot(ndp, dp);
+			if (error != 0) {
+#ifdef KTRACE
+				if (KTRPOINT(curthread, KTR_CAPFAIL))
+					ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
+#endif
+				goto bad;
+			}
 		}
 	}
 
@@ -735,6 +844,7 @@ unionlookup:
 			vn_lock(dp,
 			    compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
 			    LK_RETRY, cnp->cn_flags));
+			nameicap_tracker_add(ndp, dp);
 			goto unionlookup;
 		}
 
@@ -855,6 +965,16 @@ nextname:
 			vrele(ndp->ni_dvp);
 		goto dirloop;
 	}
+	if (cnp->cn_flags & ISDOTDOT) {
+		error = nameicap_check_dotdot(ndp, ndp->ni_vp);
+		if (error != 0) {
+#ifdef KTRACE
+			if (KTRPOINT(curthread, KTR_CAPFAIL))
+				ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
+#endif
+			goto bad2;
+		}
+	}
 	if (*ndp->ni_next == '/') {
 		cnp->cn_nameptr = ndp->ni_next;
 		while (*cnp->cn_nameptr == '/') {
@@ -1081,7 +1201,6 @@ NDINIT_ALL(struct nameidata *ndp, u_long
 	ndp->ni_dirp = namep;
 	ndp->ni_dirfd = dirfd;
 	ndp->ni_startdir = startdir;
-	ndp->ni_strictrelative = 0;
 	if (rightsp != NULL)
 		ndp->ni_rightsneeded = *rightsp;
 	else

Modified: head/sys/kern/vfs_syscalls.c
==============================================================================
--- head/sys/kern/vfs_syscalls.c	Wed Nov  2 12:10:39 2016	(r308211)
+++ head/sys/kern/vfs_syscalls.c	Wed Nov  2 12:43:15 2016	(r308212)
@@ -1030,7 +1030,7 @@ kern_openat(struct thread *td, int fd, c
 		 * understand exactly what would happen, and we don't think
 		 * that it ever should.
 		 */
-		if (nd.ni_strictrelative == 0 &&
+		if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) == 0 &&
 		    (error == ENODEV || error == ENXIO) &&
 		    td->td_dupfd >= 0) {
 			error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
@@ -1076,7 +1076,7 @@ success:
 		struct filecaps *fcaps;
 
 #ifdef CAPABILITIES
-		if (nd.ni_strictrelative == 1)
+		if ((nd.ni_lcf & NI_LCF_STRICTRELATIVE) != 0)
 			fcaps = &nd.ni_filecaps;
 		else
 #endif

Modified: head/sys/sys/namei.h
==============================================================================
--- head/sys/sys/namei.h	Wed Nov  2 12:10:39 2016	(r308211)
+++ head/sys/sys/namei.h	Wed Nov  2 12:43:15 2016	(r308212)
@@ -55,6 +55,9 @@ struct componentname {
 	long	cn_namelen;	/* length of looked up component */
 };
 
+struct nameicap_tracker;
+TAILQ_HEAD(nameicap_tracker_head, nameicap_tracker);
+
 /*
  * Encapsulation of namei parameters.
  */
@@ -72,7 +75,7 @@ struct nameidata {
 	struct	vnode *ni_rootdir;	/* logical root directory */
 	struct	vnode *ni_topdir;	/* logical top directory */
 	int	ni_dirfd;		/* starting directory for *at functions */
-	int	ni_strictrelative;	/* relative lookup only; no '..' */
+	int	ni_lcf;			/* local call flags */
 	/*
 	 * Results: returned from namei
 	 */
@@ -94,6 +97,7 @@ struct nameidata {
 	 * through the VOP interface.
 	 */
 	struct componentname ni_cnd;
+	struct nameicap_tracker_head ni_cap_tracker;
 };
 
 #ifdef _KERNEL
@@ -152,6 +156,12 @@ struct nameidata {
 #define	PARAMASK	0x3ffffe00 /* mask of parameter descriptors */
 
 /*
+ * Flags in ni_lcf, valid for the duration of the namei call.
+ */
+#define	NI_LCF_STRICTRELATIVE	0x0001	/* relative lookup only */
+#define	NI_LCF_CAP_DOTDOT	0x0002	/* ".." in strictrelative case */
+
+/*
  * Initialization of a nameidata structure.
  */
 #define	NDINIT(ndp, op, flags, segflg, namep, td)			\



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201611021243.uA2ChGIg003295>