From owner-dev-commits-src-main@freebsd.org Sun Dec 27 17:23:01 2020 Return-Path: Delivered-To: dev-commits-src-main@mailman.nyi.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mailman.nyi.freebsd.org (Postfix) with ESMTP id 38CCD4C074D; Sun, 27 Dec 2020 17:23:01 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256 client-signature RSA-PSS (4096 bits) client-digest SHA256) (Client CN "mxrelay.nyi.freebsd.org", Issuer "Let's Encrypt Authority X3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 4D3nZF19xlz4rL0; Sun, 27 Dec 2020 17:23:01 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from gitrepo.freebsd.org (gitrepo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:5]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature RSA-PSS (4096 bits) server-digest SHA256) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id 1B55B2F8B; Sun, 27 Dec 2020 17:23:01 +0000 (UTC) (envelope-from git@FreeBSD.org) Received: from gitrepo.freebsd.org ([127.0.1.44]) by gitrepo.freebsd.org (8.16.1/8.16.1) with ESMTP id 0BRHN1KR036143; Sun, 27 Dec 2020 17:23:01 GMT (envelope-from git@gitrepo.freebsd.org) Received: (from git@localhost) by gitrepo.freebsd.org (8.16.1/8.16.1/Submit) id 0BRHN1QR036142; Sun, 27 Dec 2020 17:23:01 GMT (envelope-from git) Date: Sun, 27 Dec 2020 17:23:01 GMT Message-Id: <202012271723.0BRHN1QR036142@gitrepo.freebsd.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org From: Mateusz Guzik Subject: git: 6dbb07ed6872 - main - cache: modification and last entry filling support in lockless lookup MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit X-Git-Committer: mjg X-Git-Repository: src X-Git-Refname: refs/heads/main X-Git-Reftype: branch X-Git-Commit: 6dbb07ed6872ae7988b9b705e322c94658eba6d1 Auto-Submitted: auto-generated X-BeenThere: dev-commits-src-main@freebsd.org X-Mailman-Version: 2.1.34 Precedence: list List-Id: "Commit messages for the main branch of the src repository." List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sun, 27 Dec 2020 17:23:01 -0000 The branch main has been updated by mjg: URL: https://cgit.FreeBSD.org/src/commit/?id=6dbb07ed6872ae7988b9b705e322c94658eba6d1 commit 6dbb07ed6872ae7988b9b705e322c94658eba6d1 Author: Mateusz Guzik AuthorDate: 2020-12-27 15:28:14 +0000 Commit: Mateusz Guzik CommitDate: 2020-12-27 17:22:25 +0000 cache: modification and last entry filling support in lockless lookup Tested by: pho (previous version) --- sys/kern/vfs_cache.c | 297 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 281 insertions(+), 16 deletions(-) diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index 38121893126e..7b7149a15e92 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -3724,6 +3724,13 @@ cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) #define cache_fpl_handled(x, e) cache_fpl_handled_impl((x), (e), __LINE__) +static bool +cache_fpl_terminated(struct cache_fpl *fpl) +{ + + return (fpl->status != CACHE_FPL_STATUS_UNSET); +} + #define CACHE_FPL_SUPPORTED_CN_FLAGS \ (NC_NOMAKEENTRY | NC_KEEPPOSENTRY | LOCKLEAF | LOCKPARENT | WANTPARENT | \ FOLLOW | LOCKSHARED | SAVENAME | SAVESTART | WILLBEDIR | ISOPEN | \ @@ -3735,6 +3742,8 @@ cache_fpl_handled_impl(struct cache_fpl *fpl, int error, int line) _Static_assert((CACHE_FPL_SUPPORTED_CN_FLAGS & CACHE_FPL_INTERNAL_CN_FLAGS) == 0, "supported and internal flags overlap"); +static bool cache_fplookup_need_climb_mount(struct cache_fpl *fpl); + static bool cache_fpl_islastcn(struct nameidata *ndp) { @@ -3857,6 +3866,16 @@ cache_fplookup_partial_setup(struct cache_fpl *fpl) return (cache_fpl_aborted(fpl)); } + /* + * Note that seqc is checked before the vnode is locked, so by + * the time regular lookup gets to it it may have moved. + * + * Ultimately this does not affect correctness, any lookup errors + * are userspace racing with itself. It is guaranteed that any + * path which ultimatley gets found could also have been found + * by regular lookup going all the way in absence of concurrent + * modifications. + */ dvs = vget_prep_smr(dvp); cache_fpl_smr_exit(fpl); if (__predict_false(dvs == VGET_NONE)) { @@ -3874,6 +3893,7 @@ cache_fplookup_partial_setup(struct cache_fpl *fpl) cache_fpl_restore_partial(fpl, &fpl->snd); ndp->ni_startdir = dvp; + MPASS((cnp->cn_flags & MAKEENTRY) == 0); cnp->cn_flags |= MAKEENTRY; if (cache_fpl_islastcn(ndp)) cnp->cn_flags |= ISLASTCN; @@ -3920,18 +3940,159 @@ cache_fplookup_final_child(struct cache_fpl *fpl, enum vgetstate tvs) /* * They want to possibly modify the state of the namecache. - * - * Don't try to match the API contract, just leave. - * TODO: this leaves scalability on the table */ -static int +static int __noinline cache_fplookup_final_modifying(struct cache_fpl *fpl) { + struct nameidata *ndp; struct componentname *cnp; + enum vgetstate dvs; + struct vnode *dvp, *tvp; + struct mount *mp; + seqc_t dvp_seqc; + int error; + bool docache; + ndp = fpl->ndp; cnp = fpl->cnp; - MPASS(cnp->cn_nameiop != LOOKUP); - return (cache_fpl_partial(fpl)); + dvp = fpl->dvp; + dvp_seqc = fpl->dvp_seqc; + + MPASS(cache_fpl_islastcn(ndp)); + if ((cnp->cn_flags & LOCKPARENT) == 0) + MPASS((cnp->cn_flags & WANTPARENT) != 0); + MPASS((cnp->cn_flags & TRAILINGSLASH) == 0); + MPASS(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == DELETE || + cnp->cn_nameiop == RENAME); + + docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; + if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME) + docache = false; + + mp = atomic_load_ptr(&dvp->v_mount); + if (__predict_false(mp == NULL)) { + return (cache_fpl_aborted(fpl)); + } + + if (__predict_false(mp->mnt_flag & MNT_RDONLY)) { + cache_fpl_smr_exit(fpl); + /* + * Original code keeps not checking for CREATE which + * might be a bug. For now let the old lookup decide. + */ + if (cnp->cn_nameiop == CREATE) { + return (cache_fpl_aborted(fpl)); + } + return (cache_fpl_handled(fpl, EROFS)); + } + + /* + * Secure access to dvp; check cache_fplookup_partial_setup for + * reasoning. + * + * XXX At least UFS requires its lookup routine to be called for + * the last path component, which leads to some level of complicaton + * and inefficiency: + * - the target routine always locks the target vnode, but our caller + * may not need it locked + * - some of the VOP machinery asserts that the parent is locked, which + * once more may be not required + * + * TODO: add a flag for filesystems which don't need this. + */ + dvs = vget_prep_smr(dvp); + cache_fpl_smr_exit(fpl); + if (__predict_false(dvs == VGET_NONE)) { + return (cache_fpl_aborted(fpl)); + } + + vget_finish_ref(dvp, dvs); + if (!vn_seqc_consistent(dvp, dvp_seqc)) { + vrele(dvp); + return (cache_fpl_aborted(fpl)); + } + + error = vn_lock(dvp, LK_EXCLUSIVE); + if (__predict_false(error != 0)) { + vrele(dvp); + return (cache_fpl_aborted(fpl)); + } + + tvp = NULL; + MPASS((cnp->cn_flags & MAKEENTRY) == 0); + if (docache) + cnp->cn_flags |= MAKEENTRY; + cnp->cn_flags |= ISLASTCN; + cnp->cn_lkflags = LK_EXCLUSIVE; + error = VOP_LOOKUP(dvp, &tvp, cnp); + switch (error) { + case EJUSTRETURN: + case 0: + break; + case ENOTDIR: + case ENOENT: + vput(dvp); + return (cache_fpl_handled(fpl, error)); + default: + vput(dvp); + return (cache_fpl_aborted(fpl)); + } + + fpl->tvp = tvp; + + if (tvp == NULL) { + if ((cnp->cn_flags & SAVESTART) != 0) { + ndp->ni_startdir = dvp; + vrefact(ndp->ni_startdir); + cnp->cn_flags |= SAVENAME; + } + MPASS(error == EJUSTRETURN); + if ((cnp->cn_flags & LOCKPARENT) == 0) { + VOP_UNLOCK(dvp); + } + return (cache_fpl_handled(fpl, 0)); + } + + /* + * Check if the target is either a symlink or a mount point. + * Since we expect this to be the terminal vnode it should + * almost never be true. + */ + if (__predict_false(!cache_fplookup_vnode_supported(tvp) || + cache_fplookup_need_climb_mount(fpl))) { + vput(dvp); + vput(tvp); + return (cache_fpl_aborted(fpl)); + } + + if ((cnp->cn_flags & LOCKLEAF) == 0) { + VOP_UNLOCK(tvp); + } + + if ((cnp->cn_flags & LOCKPARENT) == 0) { + VOP_UNLOCK(dvp); + } + + if ((cnp->cn_flags & SAVESTART) != 0) { + ndp->ni_startdir = dvp; + vrefact(ndp->ni_startdir); + cnp->cn_flags |= SAVENAME; + } + + return (cache_fpl_handled(fpl, 0)); +} + +static int __noinline +cache_fplookup_modifying(struct cache_fpl *fpl) +{ + struct nameidata *ndp; + + ndp = fpl->ndp; + + if (!cache_fpl_islastcn(ndp)) { + return (cache_fpl_partial(fpl)); + } + return (cache_fplookup_final_modifying(fpl)); } static int __noinline @@ -4012,8 +4173,6 @@ cache_fplookup_final(struct cache_fpl *fpl) dvp_seqc = fpl->dvp_seqc; tvp = fpl->tvp; - VNPASS(cache_fplookup_vnode_supported(dvp), dvp); - if (cnp->cn_nameiop != LOOKUP) { return (cache_fplookup_final_modifying(fpl)); } @@ -4036,6 +4195,117 @@ cache_fplookup_final(struct cache_fpl *fpl) return (cache_fplookup_final_child(fpl, tvs)); } +static int __noinline +cache_fplookup_noentry(struct cache_fpl *fpl) +{ + struct nameidata *ndp; + struct componentname *cnp; + enum vgetstate dvs; + struct vnode *dvp, *tvp; + seqc_t dvp_seqc; + int error; + bool docache; + + ndp = fpl->ndp; + cnp = fpl->cnp; + dvp = fpl->dvp; + dvp_seqc = fpl->dvp_seqc; + + if (cnp->cn_nameiop != LOOKUP) { + return (cache_fplookup_modifying(fpl)); + } + + MPASS((cnp->cn_flags & SAVESTART) == 0); + + /* + * Only try to fill in the component if it is the last one, + * otherwise not only there may be several to handle but the + * walk may be complicated. + */ + if (!cache_fpl_islastcn(ndp)) { + return (cache_fpl_partial(fpl)); + } + + /* + * Secure access to dvp; check cache_fplookup_partial_setup for + * reasoning. + */ + dvs = vget_prep_smr(dvp); + cache_fpl_smr_exit(fpl); + if (__predict_false(dvs == VGET_NONE)) { + return (cache_fpl_aborted(fpl)); + } + + vget_finish_ref(dvp, dvs); + if (!vn_seqc_consistent(dvp, dvp_seqc)) { + vrele(dvp); + return (cache_fpl_aborted(fpl)); + } + + error = vn_lock(dvp, LK_SHARED); + if (__predict_false(error != 0)) { + vrele(dvp); + return (cache_fpl_aborted(fpl)); + } + + tvp = NULL; + /* + * TODO: provide variants which don't require locking either vnode. + */ + docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; + MPASS((cnp->cn_flags & MAKEENTRY) == 0); + if (docache) + cnp->cn_flags |= MAKEENTRY; + cnp->cn_flags |= ISLASTCN; + cnp->cn_lkflags = LK_SHARED; + if ((cnp->cn_flags & LOCKSHARED) == 0) { + cnp->cn_lkflags = LK_EXCLUSIVE; + } + error = VOP_LOOKUP(dvp, &tvp, cnp); + switch (error) { + case EJUSTRETURN: + case 0: + break; + case ENOTDIR: + case ENOENT: + vput(dvp); + return (cache_fpl_handled(fpl, error)); + default: + vput(dvp); + return (cache_fpl_aborted(fpl)); + } + + fpl->tvp = tvp; + + if (tvp == NULL) { + MPASS(error == EJUSTRETURN); + if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { + vput(dvp); + } else if ((cnp->cn_flags & LOCKPARENT) == 0) { + VOP_UNLOCK(dvp); + } + return (cache_fpl_handled(fpl, 0)); + } + + if (__predict_false(!cache_fplookup_vnode_supported(tvp) || + cache_fplookup_need_climb_mount(fpl))) { + vput(dvp); + vput(tvp); + return (cache_fpl_aborted(fpl)); + } + + if ((cnp->cn_flags & LOCKLEAF) == 0) { + VOP_UNLOCK(tvp); + } + + if ((cnp->cn_flags & (WANTPARENT | LOCKPARENT)) == 0) { + vput(dvp); + } else if ((cnp->cn_flags & LOCKPARENT) == 0) { + VOP_UNLOCK(dvp); + } + return (cache_fpl_handled(fpl, 0)); +} + static int __noinline cache_fplookup_dot(struct cache_fpl *fpl) { @@ -4184,13 +4454,8 @@ cache_fplookup_next(struct cache_fpl *fpl) break; } - /* - * If there is no entry we have to punt to the slow path to perform - * actual lookup. Should there be nothing with this name a negative - * entry will be created. - */ if (__predict_false(ncp == NULL)) { - return (cache_fpl_partial(fpl)); + return (cache_fplookup_noentry(fpl)); } tvp = atomic_load_ptr(&ncp->nc_vp); @@ -4539,12 +4804,12 @@ cache_fplookup_impl(struct vnode *dvp, struct cache_fpl *fpl) if (__predict_false(cache_fpl_isdotdot(cnp))) { error = cache_fplookup_dotdot(fpl); - if (__predict_false(error != 0)) { + if (__predict_false(cache_fpl_terminated(fpl))) { break; } } else { error = cache_fplookup_next(fpl); - if (__predict_false(error != 0)) { + if (__predict_false(cache_fpl_terminated(fpl))) { break; }