Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 10 Sep 2016 16:29:53 +0000 (UTC)
From:      Mateusz Guzik <mjg@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r305684 - head/sys/kern
Message-ID:  <201609101629.u8AGTrmG013017@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mjg
Date: Sat Sep 10 16:29:53 2016
New Revision: 305684
URL: https://svnweb.freebsd.org/changeset/base/305684

Log:
  cache: improve scalability by introducing bucket locks
  
  An array of bucket locks is added.
  
  All modifications still require the global cache_lock to be held for
  writing. However, most readers only need the relevant bucket lock and in
  effect can run concurrently to the writer as long as they use a
  different lock. See the added comment for more details.
  
  This is an intermediate step towards removal of the global lock.
  
  Reviewed by:	kib
  Tested by:	pho

Modified:
  head/sys/kern/subr_witness.c
  head/sys/kern/vfs_cache.c

Modified: head/sys/kern/subr_witness.c
==============================================================================
--- head/sys/kern/subr_witness.c	Sat Sep 10 16:11:42 2016	(r305683)
+++ head/sys/kern/subr_witness.c	Sat Sep 10 16:29:53 2016	(r305684)
@@ -623,6 +623,14 @@ static struct witness_order_list_entry o
 	{ "vnode interlock", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
+	 * VFS namecache
+	 */
+	{ "ncglobal", &lock_class_rw },
+	{ "ncbuc", &lock_class_rw },
+	{ "vnode interlock", &lock_class_mtx_sleep },
+	{ "ncneg", &lock_class_mtx_sleep },
+	{ NULL, NULL },
+	/*
 	 * ZFS locking
 	 */
 	{ "dn->dn_mtx", &lock_class_sx },

Modified: head/sys/kern/vfs_cache.c
==============================================================================
--- head/sys/kern/vfs_cache.c	Sat Sep 10 16:11:42 2016	(r305683)
+++ head/sys/kern/vfs_cache.c	Sat Sep 10 16:29:53 2016	(r305684)
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
+#include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
@@ -148,6 +149,23 @@ struct	namecache_ts {
  * Upon reaching the last segment of a path, if the reference
  * is for DELETE, or NOCACHE is set (rewrite), and the
  * name is located in the cache, it will be dropped.
+ *
+ * These locks are used (in the order in which they can be taken):
+ * NAME         TYPE    ROLE
+ * cache_lock   rwlock  global, needed for all modifications
+ * bucketlock   rwlock  for access to given hash bucket
+ * ncneg_mtx    mtx     negative entry LRU management
+ *
+ * A name -> vnode lookup can be safely performed by either locking cache_lock
+ * or the relevant hash bucket.
+ *
+ * ".." and vnode -> name lookups require cache_lock.
+ *
+ * Modifications require both cache_lock and relevant bucketlock taken for
+ * writing.
+ *
+ * Negative entry LRU management requires ncneg_mtx taken on top of either
+ * cache_lock or bucketlock.
  */
 
 /*
@@ -179,8 +197,9 @@ SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor
 struct nchstats	nchstats;		/* cache effectiveness statistics */
 
 static struct rwlock cache_lock;
-RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
+RW_SYSINIT(vfscache, &cache_lock, "ncglobal");
 
+#define	CACHE_TRY_WLOCK()	rw_try_wlock(&cache_lock)
 #define	CACHE_UPGRADE_LOCK()	rw_try_upgrade(&cache_lock)
 #define	CACHE_RLOCK()		rw_rlock(&cache_lock)
 #define	CACHE_RUNLOCK()		rw_runlock(&cache_lock)
@@ -188,7 +207,12 @@ RW_SYSINIT(vfscache, &cache_lock, "Name 
 #define	CACHE_WUNLOCK()		rw_wunlock(&cache_lock)
 
 static struct mtx_padalign ncneg_mtx;
-MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "Name Cache neg", MTX_DEF);
+MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "ncneg", MTX_DEF);
+
+static u_int   numbucketlocks;
+static struct rwlock_padalign  *bucketlocks;
+#define	HASH2BUCKETLOCK(hash) \
+	((struct rwlock *)(&bucketlocks[((hash) % numbucketlocks)]))
 
 /*
  * UMA zones for the VFS cache.
@@ -307,6 +331,8 @@ STATNODE_COUNTER(numfullpathfail4, "Numb
 STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
 static long numupgrades; STATNODE_ULONG(numupgrades,
     "Number of updates of the cache after lookup (write lock + retry)");
+static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
+    "Number of times bucketlocked zap_and_exit case failed to writelock");
 
 static void cache_zap(struct namecache *ncp);
 static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
@@ -326,6 +352,39 @@ cache_get_hash(char *name, u_char len, s
 	return (hash);
 }
 
+#ifdef INVARIANTS
+static void
+cache_assert_bucket_locked(struct namecache *ncp, int mode)
+{
+	struct rwlock *bucketlock;
+	uint32_t hash;
+
+	hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
+	bucketlock = HASH2BUCKETLOCK(hash);
+	rw_assert(bucketlock, mode);
+}
+#else
+#define cache_assert_bucket_locked(x, y) do { } while (0)
+#endif
+
+static void
+cache_lock_all_buckets(void)
+{
+	u_int i;
+
+	for (i = 0; i < numbucketlocks; i++)
+		rw_wlock(&bucketlocks[i]);
+}
+
+static void
+cache_unlock_all_buckets(void)
+{
+	u_int i;
+
+	for (i = 0; i < numbucketlocks; i++)
+		rw_wunlock(&bucketlocks[i]);
+}
+
 static int
 sysctl_nchstats(SYSCTL_HANDLER_ARGS)
 {
@@ -442,21 +501,13 @@ SYSCTL_PROC(_debug_hashstat, OID_AUTO, n
  * Negative entries management
  */
 static void
-cache_negative_hit(struct namecache *ncp, int wlocked)
+cache_negative_hit(struct namecache *ncp)
 {
 
-	if (!wlocked) {
-		rw_assert(&cache_lock, RA_RLOCKED);
-		mtx_lock(&ncneg_mtx);
-	} else {
-		rw_assert(&cache_lock, RA_WLOCKED);
-	}
-
+	mtx_lock(&ncneg_mtx);
 	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
 	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
-
-	if (!wlocked)
-		mtx_unlock(&ncneg_mtx);
+	mtx_unlock(&ncneg_mtx);
 }
 
 static void
@@ -464,9 +515,12 @@ cache_negative_insert(struct namecache *
 {
 
 	rw_assert(&cache_lock, RA_WLOCKED);
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 	MPASS(ncp->nc_vp == NULL);
+	mtx_lock(&ncneg_mtx);
 	TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
 	numneg++;
+	mtx_unlock(&ncneg_mtx);
 }
 
 static void
@@ -474,9 +528,12 @@ cache_negative_remove(struct namecache *
 {
 
 	rw_assert(&cache_lock, RA_WLOCKED);
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 	MPASS(ncp->nc_vp == NULL);
+	mtx_lock(&ncneg_mtx);
 	TAILQ_REMOVE(&ncneg, ncp, nc_dst);
 	numneg--;
+	mtx_unlock(&ncneg_mtx);
 }
 
 static struct namecache *
@@ -499,10 +556,11 @@ cache_negative_zap_one(void)
  *   pointer to a vnode or if it is just a negative cache entry.
  */
 static void
-cache_zap(struct namecache *ncp)
+cache_zap_locked(struct namecache *ncp)
 {
 
 	rw_assert(&cache_lock, RA_WLOCKED);
+	cache_assert_bucket_locked(ncp, RA_WLOCKED);
 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
 	if (ncp->nc_vp != NULL) {
 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
@@ -532,6 +590,21 @@ cache_zap(struct namecache *ncp)
 	numcache--;
 }
 
+static void
+cache_zap(struct namecache *ncp)
+{
+	struct rwlock *bucketlock;
+	uint32_t hash;
+
+	rw_assert(&cache_lock, RA_WLOCKED);
+
+	hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
+	bucketlock = HASH2BUCKETLOCK(hash);
+	rw_wlock(bucketlock);
+	cache_zap_locked(ncp);
+	rw_wunlock(bucketlock);
+}
+
 /*
  * Lookup an entry in the cache
  *
@@ -549,22 +622,42 @@ cache_zap(struct namecache *ncp)
  * not recursively acquired.
  */
 
+enum { UNLOCKED, WLOCKED, RLOCKED };
+
+static void
+cache_unlock(int cache_locked)
+{
+
+	switch (cache_locked) {
+	case UNLOCKED:
+		break;
+	case WLOCKED:
+		CACHE_WUNLOCK();
+		break;
+	case RLOCKED:
+		CACHE_RUNLOCK();
+		break;
+	}
+}
+
 int
 cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
     struct timespec *tsp, int *ticksp)
 {
+	struct rwlock *bucketlock;
 	struct namecache *ncp;
 	uint32_t hash;
-	int error, ltype, wlocked;
+	int error, ltype, cache_locked;
 
 	if (!doingcache) {
 		cnp->cn_flags &= ~MAKEENTRY;
 		return (0);
 	}
 retry:
-	wlocked = 0;
-	counter_u64_add(numcalls, 1);
+	bucketlock = NULL;
+	cache_locked = UNLOCKED;
 	error = 0;
+	counter_u64_add(numcalls, 1);
 
 retry_wlocked:
 	if (cnp->cn_nameptr[0] == '.') {
@@ -598,17 +691,21 @@ retry_wlocked:
 			}
 			return (-1);
 		}
-		if (!wlocked)
-			CACHE_RLOCK();
 		if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
 			counter_u64_add(dotdothits, 1);
+			if (cache_locked == UNLOCKED) {
+				CACHE_RLOCK();
+				cache_locked = RLOCKED;
+			}
+
 			if (dvp->v_cache_dd == NULL) {
 				SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
 				    "..", NULL);
 				goto unlock;
 			}
 			if ((cnp->cn_flags & MAKEENTRY) == 0) {
-				if (!wlocked && !CACHE_UPGRADE_LOCK())
+				if (cache_locked != WLOCKED &&
+				    !CACHE_UPGRADE_LOCK())
 					goto wlock;
 				ncp = NULL;
 				if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT) {
@@ -639,10 +736,14 @@ retry_wlocked:
 				    nc_dotdottime;
 			goto success;
 		}
-	} else if (!wlocked)
-		CACHE_RLOCK();
+	}
 
 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
+	if (cache_locked == UNLOCKED) {
+		bucketlock = HASH2BUCKETLOCK(hash);
+		rw_rlock(bucketlock);
+	}
+
 	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		counter_u64_add(numchecks, 1);
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
@@ -665,12 +766,7 @@ retry_wlocked:
 	/* We don't want to have an entry, so dump it */
 	if ((cnp->cn_flags & MAKEENTRY) == 0) {
 		counter_u64_add(numposzaps, 1);
-		if (!wlocked && !CACHE_UPGRADE_LOCK())
-			goto wlock;
-		cache_zap(ncp);
-		CACHE_WUNLOCK();
-		cache_free(ncp);
-		return (0);
+		goto zap_and_exit;
 	}
 
 	/* We found a "positive" match, return the vnode */
@@ -689,25 +785,20 @@ negative_success:
 	/* We found a negative match, and want to create it, so purge */
 	if (cnp->cn_nameiop == CREATE) {
 		counter_u64_add(numnegzaps, 1);
-		if (!wlocked && !CACHE_UPGRADE_LOCK())
-			goto wlock;
-		cache_zap(ncp);
-		CACHE_WUNLOCK();
-		cache_free(ncp);
-		return (0);
+		goto zap_and_exit;
 	}
 
 	counter_u64_add(numneghits, 1);
-	cache_negative_hit(ncp, wlocked);
+	cache_negative_hit(ncp);
 	if (ncp->nc_flag & NCF_WHITE)
 		cnp->cn_flags |= ISWHITEOUT;
 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
 	    nc_get_name(ncp));
 	cache_out_ts(ncp, tsp, ticksp);
-	if (wlocked)
-		CACHE_WUNLOCK();
-	else
-		CACHE_RUNLOCK();
+	MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+	if (bucketlock != NULL)
+		rw_runlock(bucketlock);
+	cache_unlock(cache_locked);
 	return (ENOENT);
 
 wlock:
@@ -716,9 +807,10 @@ wlock:
 	 * a write lock and retry the operation.
 	 */
 	CACHE_RUNLOCK();
+wlock_unlocked:
 	CACHE_WLOCK();
 	numupgrades++;
-	wlocked = 1;
+	cache_locked = WLOCKED;
 	goto retry_wlocked;
 
 success:
@@ -733,10 +825,10 @@ success:
 		VOP_UNLOCK(dvp, 0);
 	}
 	vhold(*vpp);
-	if (wlocked)
-		CACHE_WUNLOCK();
-	else
-		CACHE_RUNLOCK();
+	MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+	if (bucketlock != NULL)
+		rw_runlock(bucketlock);
+	cache_unlock(cache_locked);
 	error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
 	if (cnp->cn_flags & ISDOTDOT) {
 		vn_lock(dvp, ltype | LK_RETRY);
@@ -758,10 +850,29 @@ success:
 	return (-1);
 
 unlock:
-	if (wlocked)
-		CACHE_WUNLOCK();
-	else
-		CACHE_RUNLOCK();
+	MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+	if (bucketlock != NULL)
+		rw_runlock(bucketlock);
+	cache_unlock(cache_locked);
+	return (0);
+
+zap_and_exit:
+	if (bucketlock != NULL) {
+		rw_assert(&cache_lock, RA_UNLOCKED);
+		if (!CACHE_TRY_WLOCK()) {
+			rw_runlock(bucketlock);
+			bucketlock = NULL;
+			zap_and_exit_bucket_fail++;
+			goto wlock_unlocked;
+		}
+		cache_locked = WLOCKED;
+		rw_runlock(bucketlock);
+		bucketlock = NULL;
+	} else if (cache_locked != WLOCKED && !CACHE_UPGRADE_LOCK())
+		goto wlock;
+	cache_zap(ncp);
+	CACHE_WUNLOCK();
+	cache_free(ncp);
 	return (0);
 }
 
@@ -772,6 +883,7 @@ void
 cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
     struct timespec *tsp, struct timespec *dtsp)
 {
+	struct rwlock *bucketlock;
 	struct namecache *ncp, *n2, *ndd, *nneg;
 	struct namecache_ts *n3;
 	struct nchashhead *ncpp;
@@ -924,11 +1036,6 @@ cache_enter_time(struct vnode *dvp, stru
 		}
 	}
 
-	/*
-	 * Insert the new namecache entry into the appropriate chain
-	 * within the cache entries table.
-	 */
-	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
 	if (flag != NCF_ISDOTDOT) {
 		if (LIST_EMPTY(&dvp->v_cache_src)) {
 			vhold(dvp);
@@ -937,6 +1044,15 @@ cache_enter_time(struct vnode *dvp, stru
 		LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
 	}
 
+	bucketlock = HASH2BUCKETLOCK(hash);
+	rw_wlock(bucketlock);
+
+	/*
+	 * Insert the new namecache entry into the appropriate chain
+	 * within the cache entries table.
+	 */
+	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+
 	/*
 	 * If the entry is "negative", we place it into the
 	 * "negative" cache queue, otherwise, we place it into the
@@ -953,6 +1069,7 @@ cache_enter_time(struct vnode *dvp, stru
 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
 		    nc_get_name(ncp));
 	}
+	rw_wunlock(bucketlock);
 	if (numneg * ncnegfactor > numcache)
 		nneg = cache_negative_zap_one();
 	CACHE_WUNLOCK();
@@ -960,12 +1077,24 @@ cache_enter_time(struct vnode *dvp, stru
 	cache_free(nneg);
 }
 
+static u_int
+cache_roundup_2(u_int val)
+{
+	u_int res;
+
+	for (res = 1; res <= val; res <<= 1)
+		continue;
+
+	return (res);
+}
+
 /*
  * Name cache initialization, from vfs_init() when we are booting
  */
 static void
 nchinit(void *dummy __unused)
 {
+	u_int i;
 
 	TAILQ_INIT(&ncneg);
 
@@ -983,6 +1112,13 @@ nchinit(void *dummy __unused)
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
 
 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
+	numbucketlocks = cache_roundup_2(mp_ncpus * 16);
+	if (numbucketlocks > nchash)
+		numbucketlocks = nchash;
+	bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
+	    M_WAITOK | M_ZERO);
+	for (i = 0; i < numbucketlocks; i++)
+		rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK);
 
 	numcalls = counter_u64_alloc(M_WAITOK);
 	dothits = counter_u64_alloc(M_WAITOK);
@@ -1023,6 +1159,7 @@ cache_changesize(int newmaxvnodes)
 	 * because to do so, they have to be removed from the hash table.
 	 */
 	CACHE_WLOCK();
+	cache_lock_all_buckets();
 	old_nchashtbl = nchashtbl;
 	old_nchash = nchash;
 	nchashtbl = new_nchashtbl;
@@ -1035,6 +1172,7 @@ cache_changesize(int newmaxvnodes)
 			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
 		}
 	}
+	cache_unlock_all_buckets();
 	CACHE_WUNLOCK();
 	free(old_nchashtbl, M_VFSCACHE);
 }
@@ -1108,20 +1246,30 @@ void
 cache_purgevfs(struct mount *mp)
 {
 	TAILQ_HEAD(, namecache) ncps;
-	struct nchashhead *ncpp;
+	struct rwlock *bucketlock;
+	struct nchashhead *bucket;
 	struct namecache *ncp, *nnp;
+	u_long i, j, n_nchash;
 
 	/* Scan hash tables for applicable entries */
 	SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
 	TAILQ_INIT(&ncps);
 	CACHE_WLOCK();
-	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
-		LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
-			if (ncp->nc_dvp->v_mount != mp)
-				continue;
-			cache_zap(ncp);
-			TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
+	n_nchash = nchash + 1;
+	for (i = 0; i < numbucketlocks; i++) {
+		bucketlock = (struct rwlock *)&bucketlocks[i];
+		rw_wlock(bucketlock);
+		for (j = i; j < n_nchash; j += numbucketlocks) {
+			bucket = &nchashtbl[j];
+			LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
+				cache_assert_bucket_locked(ncp, RA_WLOCKED);
+				if (ncp->nc_dvp->v_mount != mp)
+					continue;
+				cache_zap_locked(ncp);
+				TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
+			}
 		}
+		rw_wunlock(bucketlock);
 	}
 	CACHE_WUNLOCK();
 	TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201609101629.u8AGTrmG013017>