Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 21 Jan 2012 10:12:57 +0200
From:      Kostik Belousov <kostikbel@gmail.com>
To:        John Baldwin <jhb@freebsd.org>
Cc:        Rick Macklem <rmacklem@freebsd.org>, fs@freebsd.org, Peter Wemm <peter@freebsd.org>
Subject:   Re: Race in NFS lookup can result in stale namecache entries
Message-ID:  <20120121081257.GS31224@deviant.kiev.zoral.com.ua>
In-Reply-To: <201201191117.28128.jhb@freebsd.org>
References:  <201201181707.21293.jhb@freebsd.org> <201201191026.09431.jhb@freebsd.org> <20120119160156.GF31224@deviant.kiev.zoral.com.ua> <201201191117.28128.jhb@freebsd.org>

next in thread | previous in thread | raw e-mail | index | archive | help

--WWZbIj0qUfl+CqYL
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

On Thu, Jan 19, 2012 at 11:17:28AM -0500, John Baldwin wrote:
> On Thursday, January 19, 2012 11:01:56 am Kostik Belousov wrote:
> > On Thu, Jan 19, 2012 at 10:26:09AM -0500, John Baldwin wrote:
> > > On Thursday, January 19, 2012 9:06:13 am Kostik Belousov wrote:
> > > > On Wed, Jan 18, 2012 at 05:07:21PM -0500, John Baldwin wrote:
> > > > ...
> > > > > What I concluded is that it would really be far simpler and more
> > > > > obvious if the cached timestamps were stored in the namecache entry
> > > > > directly rather than having multiple name cache entries validated by
> > > > > shared state in the nfsnode.  This does mean allowing the name cache
> > > > > to hold some filesystem-specific state.  However, I felt this was much
> > > > > cleaner than adding a lot more complexity to nfs_lookup().  Also, this
> > > > > turns out to be fairly non-invasive to implement since nfs_lookup()
> > > > > calls cache_lookup() directly, but other filesystems only call it
> > > > > indirectly via vfs_cache_lookup().  I considered letting filesystems
> > > > > store a void * cookie in the name cache entry and having them provide
> > > > > a destructor, etc.  However, that would require extra allocations for
> > > > > NFS lookups.  Instead, I just adjusted the name cache API to
> > > > > explicitly allow the filesystem to store a single timestamp in a name
> > > > > cache entry by adding a new 'cache_enter_time()' that accepts a struct
> > > > > timespec that is copied into the entry.  'cache_enter_time()' also
> > > > > saves the current value of 'ticks' in the entry.  'cache_lookup()' is
> > > > > modified to add two new arguments used to return the timespec and
> > > > > ticks value used for a namecache entry when a hit in the cache occurs.
> > > > >
> > > > > One wrinkle with this is that the name cache does not create actual
> > > > > entries for ".", and thus it would not store any timestamps for those
> > > > > lookups.  To fix this I changed the NFS client to explicitly fast-path
> > > > > lookups of "." by always returning the current directory as setup by
> > > > > cache_lookup() and never bothering to do a LOOKUP or check for stale
> > > > > attributes in that case.
> > > > >
> > > > > The current patch against 8 is at
> > > > > http://www.FreeBSD.org/~jhb/patches/nfs_lookup.patch
> > > > ...
> > > >
> > > > So now you add 8*2+4 bytes to each namecache entry on amd64 unconditionally.
> > > > Current size of the struct namecache invariant part on amd64 is 72 bytes,
> > > > so addition of 20 bytes looks slightly excessive. I am not sure about
> > > > typical distribution of the namecache nc_name length, so it is unobvious
> > > > whether the change changes the memory usage significantly.
> > > >
> > > > A flag could be added to nc_flags to indicate the presence of timestamp.
> > > > The timestamps would be conditionally placed after nc_nlen, we probably
> > > > could use union to ease the access. Then, the direct dereferences of
> > > > nc_name would need to be converted to some inline function.
> > > >
> > > > I can do this after your patch is committed, if you consider the memory
> > > > usage saving worth it.
> > >
> > > Hmm, if the memory usage really is worrying then I could move to using the
> > > void * cookie method instead.
> >
> > I think the current approach is better than cookie that again will be
> > used only for NFS. With the cookie, you still have 8 bytes for each ncp.
> > With union, you do not have the overhead for !NFS.
> >
> > Default setup allows for ~300000 vnodes on not too powerful amd64 machine,
> > the ncsizefactor 2 together with 8 bytes for cookie is 4.5MB. For 20 bytes
> > per ncp, we get 12MB overhead.
>
> Ok.  If you want to tackle the union bits I'm happy to let you do so.  That
> will at least break up the changes a bit.

Below is my take. First version of the patch added both small and large
zones with ts, but later I decided that large does not make sense.
If wanted, it can be restored easily.

diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index aa269de..c11f25f 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -97,14 +97,36 @@ struct	namecache {
 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
 	struct	vnode *nc_dvp;		/* vnode of parent of name */
 	struct	vnode *nc_vp;		/* vnode the name refers to */
-	struct	timespec nc_time;	/* timespec provided by fs */
-	int	nc_ticks;		/* ticks value when entry was added */
 	u_char	nc_flag;		/* flag bits */
 	u_char	nc_nlen;		/* length of name */
 	char	nc_name[0];		/* segment name + nul */
 };
 
 /*
+ * struct namecache_ts repeats struct namecache layout up to the
+ * nc_nlen member.
+ */
+struct	namecache_ts {
+	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
+	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
+	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
+	struct	vnode *nc_dvp;		/* vnode of parent of name */
+	struct	vnode *nc_vp;		/* vnode the name refers to */
+	u_char	nc_flag;		/* flag bits */
+	u_char	nc_nlen;		/* length of name */
+	struct	timespec nc_time;	/* timespec provided by fs */
+	int	nc_ticks;		/* ticks value when entry was added */
+	char	nc_name[0];		/* segment name + nul */
+};
+
+/*
+ * Flags in namecache.nc_flag
+ */
+#define NCF_WHITE	0x01
+#define NCF_ISDOTDOT	0x02
+#define	NCF_TS		0x04
+
+/*
  * Name caching works as follows:
  *
  * Names found by directory scans are retained in a cache
@@ -166,20 +188,50 @@ RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
  * fit in the small cache.
  */
 static uma_zone_t cache_zone_small;
+static uma_zone_t cache_zone_small_ts;
 static uma_zone_t cache_zone_large;
 
 #define	CACHE_PATH_CUTOFF	35
-#define	CACHE_ZONE_SMALL	(sizeof(struct namecache) + CACHE_PATH_CUTOFF \
-				    + 1)
-#define	CACHE_ZONE_LARGE	(sizeof(struct namecache) + NAME_MAX + 1)
-
-#define cache_alloc(len)	uma_zalloc(((len) <= CACHE_PATH_CUTOFF) ? \
-	cache_zone_small : cache_zone_large, M_WAITOK)
-#define cache_free(ncp)		do { \
-	if (ncp != NULL) \
-		uma_zfree(((ncp)->nc_nlen <= CACHE_PATH_CUTOFF) ? \
-		    cache_zone_small : cache_zone_large, (ncp)); \
-} while (0)
+
+static struct namecache *
+cache_alloc(int len, int ts)
+{
+
+	if (len > CACHE_PATH_CUTOFF)
+		return (uma_zalloc(cache_zone_large, M_WAITOK));
+	if (ts)
+		return (uma_zalloc(cache_zone_small_ts, M_WAITOK));
+	else
+		return (uma_zalloc(cache_zone_small, M_WAITOK));
+}
+
+static void
+cache_free(struct namecache *ncp)
+{
+	int ts;
+
+	if (ncp == NULL)
+		return;
+	ts = ncp->nc_flag & NCF_TS;
+	if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) {
+		if (ts)
+			uma_zfree(cache_zone_small_ts, ncp);
+		else
+			uma_zfree(cache_zone_small, ncp);
+	} else
+		uma_zfree(cache_zone_large, ncp);
+}
+
+static char *
+nc_get_name(struct namecache *ncp)
+{
+	struct namecache_ts *ncp_ts;
+
+	if ((ncp->nc_flag & NCF_TS) == 0)
+		return (ncp->nc_name);
+	ncp_ts = (struct namecache_ts *)ncp;
+	return (ncp_ts->nc_name);
+}
 
 static int	doingcache = 1;		/* 1 => enable the cache */
 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0,
@@ -235,12 +287,6 @@ static int vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
 
 static MALLOC_DEFINE(M_VFSCACHE, "vfscache", "VFS name cache entries");
 
-/*
- * Flags in namecache.nc_flag
- */
-#define NCF_WHITE	0x01
-#define NCF_ISDOTDOT	0x02
-
 #ifdef DIAGNOSTIC
 /*
  * Grab an atomic snapshot of the name cache hash chain lengths
@@ -346,10 +392,10 @@ cache_zap(ncp)
 #ifdef KDTRACE_HOOKS
 	if (ncp->nc_vp != NULL) {
 		SDT_PROBE(vfs, namecache, zap, done, ncp->nc_dvp,
-		    ncp->nc_name, ncp->nc_vp, 0, 0);
+		    nc_get_name(ncp), ncp->nc_vp, 0, 0);
 	} else {
 		SDT_PROBE(vfs, namecache, zap_negative, done, ncp->nc_dvp,
-		    ncp->nc_name, 0, 0, 0);
+		    nc_get_name(ncp), 0, 0, 0);
 	}
 #endif
 	vp = NULL;
@@ -460,10 +506,17 @@ retry_wlocked:
 			    dvp, cnp->cn_nameptr, *vpp);
 			SDT_PROBE(vfs, namecache, lookup, hit, dvp, "..",
 			    *vpp, 0, 0);
-			if (tsp != NULL)
-				*tsp = ncp->nc_time;
-			if (ticksp != NULL)
-				*ticksp = ncp->nc_ticks;
+			if (tsp != NULL) {
+				KASSERT((ncp->nc_flag & NCF_TS) != 0,
+				    ("No NCF_TS"));
+				*tsp = ((struct namecache_ts *)ncp)->nc_time;
+			}
+			if (ticksp != NULL) {
+				KASSERT((ncp->nc_flag & NCF_TS) != 0,
+				    ("No NCF_TS"));
+				*ticksp = ((struct namecache_ts *)ncp)->
+				    nc_ticks;
+			}
 			goto success;
 		}
 	}
@@ -473,7 +526,7 @@ retry_wlocked:
 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		numchecks++;
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
-		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
+		    !bcmp(nc_get_name(ncp), cnp->cn_nameptr, ncp->nc_nlen))
 			break;
 	}
 
@@ -508,12 +561,16 @@ retry_wlocked:
 		*vpp = ncp->nc_vp;
 		CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
 		    dvp, cnp->cn_nameptr, *vpp, ncp);
-		SDT_PROBE(vfs, namecache, lookup, hit, dvp, ncp->nc_name,
+		SDT_PROBE(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp),
 		    *vpp, 0, 0);
-		if (tsp != NULL)
-			*tsp = ncp->nc_time;
-		if (ticksp != NULL)
-			*ticksp = ncp->nc_ticks;
+		if (tsp != NULL) {
+			KASSERT((ncp->nc_flag & NCF_TS) != 0, ("No NCF_TS"));
+			*tsp = ((struct namecache_ts *)ncp)->nc_time;
+		}
+		if (ticksp != NULL) {
+			KASSERT((ncp->nc_flag & NCF_TS) != 0, ("No NCF_TS"));
+			*ticksp = ((struct namecache_ts *)ncp)->nc_ticks;
+		}
 		goto success;
 	}
 
@@ -543,12 +600,16 @@ negative_success:
 	nchstats.ncs_neghits++;
 	if (ncp->nc_flag & NCF_WHITE)
 		cnp->cn_flags |= ISWHITEOUT;
-	SDT_PROBE(vfs, namecache, lookup, hit_negative, dvp, ncp->nc_name,
+	SDT_PROBE(vfs, namecache, lookup, hit_negative, dvp, nc_get_name(ncp),
 	    0, 0, 0);
-	if (tsp != NULL)
-		*tsp = ncp->nc_time;
-	if (ticksp != NULL)
-		*ticksp = ncp->nc_ticks;
+	if (tsp != NULL) {
+		KASSERT((ncp->nc_flag & NCF_TS) != 0, ("No NCF_TS"));
+		*tsp = ((struct namecache_ts *)ncp)->nc_time;
+	}
+	if (ticksp != NULL) {
+		KASSERT((ncp->nc_flag & NCF_TS) != 0, ("No NCF_TS"));
+		*ticksp = ((struct namecache_ts *)ncp)->nc_ticks;
+	}
 	CACHE_WUNLOCK();
 	return (ENOENT);
 
@@ -642,6 +703,7 @@ cache_enter_time(dvp, vp, cnp, tsp)
 	struct timespec *tsp;
 {
 	struct namecache *ncp, *n2;
+	struct namecache_ts *n3;
 	struct nchashhead *ncpp;
 	uint32_t hash;
 	int flag;
@@ -708,18 +770,19 @@ cache_enter_time(dvp, vp, cnp, tsp)
 	 * Calculate the hash key and setup as much of the new
 	 * namecache entry as possible before acquiring the lock.
 	 */
-	ncp = cache_alloc(cnp->cn_namelen);
+	ncp = cache_alloc(cnp->cn_namelen, tsp != NULL);
 	ncp->nc_vp = vp;
 	ncp->nc_dvp = dvp;
 	ncp->nc_flag = flag;
-	if (tsp != NULL)
-		ncp->nc_time = *tsp;
-	else
-		timespecclear(&ncp->nc_time);
-	ncp->nc_ticks = ticks;
+	if (tsp != NULL) {
+		n3 = (struct namecache_ts *)ncp;
+		n3->nc_time = *tsp;
+		n3->nc_ticks = ticks;
+		n3->nc_flag |= NCF_TS;
+	}
 	len = ncp->nc_nlen = cnp->cn_namelen;
 	hash = fnv_32_buf(cnp->cn_nameptr, len, FNV1_32_INIT);
-	strlcpy(ncp->nc_name, cnp->cn_nameptr, len + 1);
+	strlcpy(nc_get_name(ncp), cnp->cn_nameptr, len + 1);
 	hash = fnv_32_buf(&dvp, sizeof(dvp), hash);
 	CACHE_WLOCK();
 
@@ -732,9 +795,16 @@ cache_enter_time(dvp, vp, cnp, tsp)
 	LIST_FOREACH(n2, ncpp, nc_hash) {
 		if (n2->nc_dvp == dvp &&
 		    n2->nc_nlen == cnp->cn_namelen &&
-		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
-			n2->nc_time = ncp->nc_time;
-			n2->nc_ticks = ncp->nc_ticks;
+		    !bcmp(nc_get_name(n2), cnp->cn_nameptr, n2->nc_nlen)) {
+			if (tsp != NULL) {
+				KASSERT((n2->nc_flag & NCF_TS) != 0,
+				    ("no NCF_TS"));
+				n3 = (struct namecache_ts *)n2;
+				n3->nc_time =
+				    ((struct namecache_ts *)ncp)->nc_time;
+				n3->nc_ticks =
+				    ((struct namecache_ts *)ncp)->nc_ticks;
+			}
 			CACHE_WUNLOCK();
 			cache_free(ncp);
 			return;
@@ -792,12 +862,12 @@ cache_enter_time(dvp, vp, cnp, tsp)
 	 */
 	if (vp) {
 		TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
-		SDT_PROBE(vfs, namecache, enter, done, dvp, ncp->nc_name, vp,
-		    0, 0);
+		SDT_PROBE(vfs, namecache, enter, done, dvp, nc_get_name(ncp),
+		    vp, 0, 0);
	} else {
 		TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
 		SDT_PROBE(vfs, namecache, enter_negative, done, dvp,
-		    ncp->nc_name, 0, 0, 0);
+		    nc_get_name(ncp), 0, 0, 0);
 	}
 	if (numneg * ncnegfactor > numcache) {
 		ncp = TAILQ_FIRST(&ncneg);
@@ -819,10 +889,15 @@ nchinit(void *dummy __unused)
 
 	TAILQ_INIT(&ncneg);
 
-	cache_zone_small = uma_zcreate("S VFS Cache", CACHE_ZONE_SMALL, NULL,
-	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
-	cache_zone_large = uma_zcreate("L VFS Cache", CACHE_ZONE_LARGE, NULL,
-	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+	cache_zone_small = uma_zcreate("S VFS Cache",
+	    sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+	cache_zone_small_ts = uma_zcreate("STS VFS Cache",
+	    sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+	cache_zone_large = uma_zcreate("L VFS Cache",
+	    sizeof(struct namecache_ts) + NAME_MAX + 1,
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 
 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
 }
@@ -1126,9 +1201,9 @@ vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
 			return (error);
 		}
 		*buflen -= ncp->nc_nlen;
-		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
+		memcpy(buf + *buflen, nc_get_name(ncp), ncp->nc_nlen);
 		SDT_PROBE(vfs, namecache, fullpath, hit, ncp->nc_dvp,
-		    ncp->nc_name, vp, 0, 0);
+		    nc_get_name(ncp), vp, 0, 0);
 		dvp = *vp;
 		*vp = ncp->nc_dvp;
 		vref(*vp);
@@ -1301,7 +1376,7 @@ vn_commname(struct vnode *vp, char *buf, u_int buflen)
 		return (ENOENT);
 	}
 	l = min(ncp->nc_nlen, buflen - 1);
-	memcpy(buf, ncp->nc_name, l);
+	memcpy(buf, nc_get_name(ncp), l);
 	CACHE_RUNLOCK();
 	buf[l] = '\0';
 	return (0);


--WWZbIj0qUfl+CqYL
Content-Type: application/pgp-signature
Content-Disposition: inline

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.11 (FreeBSD)

iEUEARECAAYFAk8ac4kACgkQC3+MBN1Mb4jkYgCWO9MfVJ4mkdtj82/pzDgnL6Xh
QwCgljVaycgmJVN79e0ObL9ArI6iTBs=
=mM5R
-----END PGP SIGNATURE-----

--WWZbIj0qUfl+CqYL--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20120121081257.GS31224>