From owner-svn-src-head@freebsd.org Wed Jul 1 05:59:09 2020 Return-Path: Delivered-To: svn-src-head@mailman.nyi.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mailman.nyi.freebsd.org (Postfix) with ESMTP id 94C283633B9; Wed, 1 Jul 2020 05:59:09 +0000 (UTC) (envelope-from mjg@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256 client-signature RSA-PSS (4096 bits) client-digest SHA256) (Client CN "mxrelay.nyi.freebsd.org", Issuer "Let's Encrypt Authority X3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 49xVsn3Gcqz3y2G; Wed, 1 Jul 2020 05:59:09 +0000 (UTC) (envelope-from mjg@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id 536A4244E0; Wed, 1 Jul 2020 05:59:09 +0000 (UTC) (envelope-from mjg@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id 0615x94R036297; Wed, 1 Jul 2020 05:59:09 GMT (envelope-from mjg@FreeBSD.org) Received: (from mjg@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id 0615x9Da036296; Wed, 1 Jul 2020 05:59:09 GMT (envelope-from mjg@FreeBSD.org) Message-Id: <202007010559.0615x9Da036296@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: mjg set sender to mjg@FreeBSD.org using -f From: Mateusz Guzik Date: Wed, 1 Jul 2020 05:59:09 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r362828 - head/sys/kern X-SVN-Group: head X-SVN-Commit-Author: mjg X-SVN-Commit-Paths: head/sys/kern X-SVN-Commit-Revision: 362828 X-SVN-Commit-Repository: base MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-head@freebsd.org X-Mailman-Version: 2.1.33 Precedence: list List-Id: SVN commit messages for the src tree for head/-current List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 01 Jul 2020 05:59:09 -0000 Author: mjg Date: Wed Jul 1 05:59:08 2020 New Revision: 362828 URL: https://svnweb.freebsd.org/changeset/base/362828 Log: cache: lockless forward lookup with smr This eliminates the need to take bucket locks in the common case. Concurrent lookup utilizing the same vnodes is still bottlenecked on referencing and locking path components; this will be taken care of separately. Reviewed by: kib Tested by: pho Differential Revision: https://reviews.freebsd.org/D23913 Modified: head/sys/kern/vfs_cache.c Modified: head/sys/kern/vfs_cache.c ============================================================================== --- head/sys/kern/vfs_cache.c Wed Jul 1 05:56:29 2020 (r362827) +++ head/sys/kern/vfs_cache.c Wed Jul 1 05:59:08 2020 (r362828) @@ -56,11 +56,13 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include #include #include +#include #ifdef KTRACE #include #endif @@ -104,7 +106,7 @@ SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, don */ struct namecache { - LIST_ENTRY(namecache) nc_hash; /* hash chain */ + CK_LIST_ENTRY(namecache) nc_hash;/* hash chain */ LIST_ENTRY(namecache) nc_src; /* source vnode list */ TAILQ_ENTRY(namecache) nc_dst; /* destination vnode list */ struct vnode *nc_dvp; /* vnode of parent of name */ @@ -143,7 +145,26 @@ struct namecache_ts { #define NCF_DVDROP 0x10 #define NCF_NEGATIVE 0x20 #define NCF_HOTNEGATIVE 0x40 +#define NCF_INVALID 0x80 +static bool +cache_ncp_invalid(struct namecache *ncp) +{ + + atomic_thread_fence_acq(); + return ((ncp->nc_flag & NCF_INVALID) != 0); +} + +static void +cache_ncp_invalidate(struct namecache *ncp) +{ + + atomic_thread_fence_rel(); 
+ KASSERT((ncp->nc_flag & NCF_INVALID) == 0, + ("%s: entry %p already invalid", __func__, ncp)); + ncp->nc_flag |= NCF_INVALID; +} + /* * Name caching works as follows: * @@ -192,12 +213,14 @@ struct namecache_ts { * the first node, locking everything in order and revalidating the state. */ +VFS_SMR_DECLARE; + /* * Structures associated with name caching. */ #define NCHHASH(hash) \ (&nchashtbl[(hash) & nchash]) -static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ +static __read_mostly CK_LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */ static u_long __read_mostly nchash; /* size of hash table */ SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, "Size of namecache hash table"); @@ -275,15 +298,15 @@ cache_alloc(int len, int ts) if (__predict_false(ts)) { if (len <= CACHE_PATH_CUTOFF) - ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK); + ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK); else - ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK); + ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK); ncp = &ncp_ts->nc_nc; } else { if (len <= CACHE_PATH_CUTOFF) - ncp = uma_zalloc(cache_zone_small, M_WAITOK); + ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK); else - ncp = uma_zalloc(cache_zone_large, M_WAITOK); + ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK); } return (ncp); } @@ -300,14 +323,14 @@ cache_free(struct namecache *ncp) if (__predict_false(ncp->nc_flag & NCF_TS)) { ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc); if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) - uma_zfree(cache_zone_small_ts, ncp_ts); + uma_zfree_smr(cache_zone_small_ts, ncp_ts); else - uma_zfree(cache_zone_large_ts, ncp_ts); + uma_zfree_smr(cache_zone_large_ts, ncp_ts); } else { if (ncp->nc_nlen <= CACHE_PATH_CUTOFF) - uma_zfree(cache_zone_small, ncp); + uma_zfree_smr(cache_zone_small, ncp); else - uma_zfree(cache_zone_large, ncp); + uma_zfree_smr(cache_zone_large, ncp); } } @@ -606,7 +629,7 @@ retry: } /* Scan hash tables 
counting entries */ for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++) - LIST_FOREACH(ncp, ncpp, nc_hash) + CK_LIST_FOREACH(ncp, ncpp, nc_hash) cntbuf[i]++; cache_unlock_all_buckets(); for (error = 0, i = 0; i < n_nchash; i++) @@ -639,7 +662,7 @@ sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS) /* Scan hash tables for applicable entries */ for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) { count = 0; - LIST_FOREACH(ncp, ncpp, nc_hash) { + CK_LIST_FOREACH(ncp, ncpp, nc_hash) { count++; } if (count) @@ -860,7 +883,10 @@ cache_zap_locked(struct namecache *ncp, bool neg_locke CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp); - LIST_REMOVE(ncp, nc_hash); + + cache_ncp_invalidate(ncp); + + CK_LIST_REMOVE(ncp, nc_hash); if (!(ncp->nc_flag & NCF_NEGATIVE)) { SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp, ncp->nc_name, ncp->nc_vp); @@ -1013,7 +1039,7 @@ cache_zap_unlocked_bucket(struct namecache *ncp, struc cache_sort_vnodes(&dvlp, &vlp); cache_lock_vnodes(dvlp, vlp); rw_wlock(blp); - LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { + CK_LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) { if (rncp == ncp && rncp->nc_dvp == dvp && rncp->nc_nlen == cnp->cn_namelen && !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen)) @@ -1225,12 +1251,12 @@ retry_dotdot: hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); blp = HASH2BUCKETLOCK(hash); retry: - if (LIST_EMPTY(NCHHASH(hash))) + if (CK_LIST_EMPTY(NCHHASH(hash))) goto out_no_entry; rw_wlock(blp); - LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { + CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { counter_u64_add(numchecks, 1); if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) @@ -1305,6 +1331,7 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, st uint32_t hash; enum vgetstate vs; int error, ltype; + bool try_smr, doing_smr; #ifdef DEBUG_CACHE if (__predict_false(!doingcache)) { @@ -1321,7 +1348,11 
@@ cache_lookup(struct vnode *dvp, struct vnode **vpp, st if ((cnp->cn_flags & MAKEENTRY) == 0) return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp)); + try_smr = true; + if (cnp->cn_nameiop == CREATE) + try_smr = false; retry: + doing_smr = false; blp = NULL; dvlp = NULL; error = 0; @@ -1361,10 +1392,17 @@ retry: } hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp); - blp = HASH2BUCKETLOCK(hash); - rw_rlock(blp); +retry_hashed: + if (try_smr) { + vfs_smr_enter(); + doing_smr = true; + try_smr = false; + } else { + blp = HASH2BUCKETLOCK(hash); + rw_rlock(blp); + } - LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { + CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) { counter_u64_add(numchecks, 1); if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) @@ -1373,7 +1411,10 @@ retry: /* We failed to find an entry */ if (__predict_false(ncp == NULL)) { - rw_runlock(blp); + if (doing_smr) + vfs_smr_exit(); + else + rw_runlock(blp); SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr, NULL); counter_u64_add(nummiss, 1); @@ -1402,8 +1443,22 @@ success: ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK(dvp); } - vs = vget_prep(*vpp); - cache_lookup_unlock(blp, dvlp); + if (doing_smr) { + if (cache_ncp_invalid(ncp)) { + vfs_smr_exit(); + *vpp = NULL; + goto retry; + } + vs = vget_prep_smr(*vpp); + vfs_smr_exit(); + if (vs == VGET_NONE) { + *vpp = NULL; + goto retry; + } + } else { + vs = vget_prep(*vpp); + cache_lookup_unlock(blp, dvlp); + } error = vget_finish(*vpp, cnp->cn_lkflags, vs); if (cnp->cn_flags & ISDOTDOT) { vn_lock(dvp, ltype | LK_RETRY); @@ -1427,21 +1482,37 @@ success: negative_success: /* We found a negative match, and want to create it, so purge */ if (cnp->cn_nameiop == CREATE) { + MPASS(!doing_smr); counter_u64_add(numnegzaps, 1); goto zap_and_exit; } + if (doing_smr) { + if ((ncp->nc_flag & NCF_HOTNEGATIVE) == 0) { + /* + * We need to take locks to promote the entry. 
+ */ + vfs_smr_exit(); + doing_smr = false; + goto retry_hashed; + } + } else { + cache_negative_hit(ncp); + } counter_u64_add(numneghits, 1); - cache_negative_hit(ncp); if (ncp->nc_flag & NCF_WHITE) cnp->cn_flags |= ISWHITEOUT; SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name); cache_out_ts(ncp, tsp, ticksp); - cache_lookup_unlock(blp, dvlp); + if (doing_smr) + vfs_smr_exit(); + else + cache_lookup_unlock(blp, dvlp); return (ENOENT); zap_and_exit: + MPASS(!doing_smr); if (blp != NULL) error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp); else @@ -1775,7 +1846,7 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, * the same path name. */ ncpp = NCHHASH(hash); - LIST_FOREACH(n2, ncpp, nc_hash) { + CK_LIST_FOREACH(n2, ncpp, nc_hash) { if (n2->nc_dvp == dvp && n2->nc_nlen == cnp->cn_namelen && !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) { @@ -1840,12 +1911,6 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, } /* - * Insert the new namecache entry into the appropriate chain - * within the cache entries table. - */ - LIST_INSERT_HEAD(ncpp, ncp, nc_hash); - - /* * If the entry is "negative", we place it into the * "negative" cache queue, otherwise, we place it into the * destination vnode's cache entries queue. @@ -1861,6 +1926,14 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, SDT_PROBE2(vfs, namecache, enter_negative, done, dvp, ncp->nc_name); } + + atomic_thread_fence_rel(); + /* + * Insert the new namecache entry into the appropriate chain + * within the cache entries table. 
+ */ + CK_LIST_INSERT_HEAD(ncpp, ncp, nc_hash); + cache_enter_unlock(&cel); if (numneg * ncnegfactor > lnumcache) cache_negative_zap_one(); @@ -1908,6 +1981,11 @@ nchinit(void *dummy __unused) NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts), UMA_ZONE_ZINIT); + VFS_SMR_ZONE_SET(cache_zone_small); + VFS_SMR_ZONE_SET(cache_zone_small_ts); + VFS_SMR_ZONE_SET(cache_zone_large); + VFS_SMR_ZONE_SET(cache_zone_large_ts); + ncsize = desiredvnodes * ncsizefactor; nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash); ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1; @@ -1973,11 +2051,11 @@ cache_changesize(u_long newmaxvnodes) nchashtbl = new_nchashtbl; nchash = new_nchash; for (i = 0; i <= old_nchash; i++) { - while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) { + while ((ncp = CK_LIST_FIRST(&old_nchashtbl[i])) != NULL) { hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen, ncp->nc_dvp); - LIST_REMOVE(ncp, nc_hash); - LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); + CK_LIST_REMOVE(ncp, nc_hash); + CK_LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash); } } ncsize = newncsize; @@ -2091,7 +2169,7 @@ cache_purgevfs(struct mount *mp, bool force) for (j = i; j < n_nchash; j += numbucketlocks) { retry: bucket = &nchashtbl[j]; - LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { + CK_LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) { cache_assert_bucket_locked(ncp, RA_WLOCKED); if (ncp->nc_dvp->v_mount != mp) continue;