Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 22 Nov 2014 22:11:47 +0100
From:      Mateusz Guzik <mjguzik@gmail.com>
To:        Konstantin Belousov <kostikbel@gmail.com>
Cc:        freebsd-fs@freebsd.org
Subject:   Re: atomic v_usecount and v_holdcnt
Message-ID:  <20141122211147.GA23623@dft-labs.eu>
In-Reply-To: <20141122092527.GT17068@kib.kiev.ua>
References:  <20141122002812.GA32289@dft-labs.eu> <20141122092527.GT17068@kib.kiev.ua>

next in thread | previous in thread | raw e-mail | index | archive | help
On Sat, Nov 22, 2014 at 11:25:27AM +0200, Konstantin Belousov wrote:
> On Sat, Nov 22, 2014 at 01:28:12AM +0100, Mateusz Guzik wrote:
> > The idea is that we don't need an interlock as long as we don't
> > transition either counter 1->0 or 0->1.
> I already said that something along the lines of the patch should work.
> In fact, you need vnode lock when hold count changes between 0 and 1,
> and probably the same for use count.
> 

I don't see why this would be required (not that I'm an VFS expert).
vnode recycling seems to be protected with the interlock.

In fact I would argue that if this is really needed, current code is
buggy.

interlock is taken in e.g. vgone with vnode already locked, so for cases
where we get interlock -> lock, the kernel has to drop the former before
blocking in order to avoid deadlocks. And this opens the same window
present with my patch.

> Some notes about the patch.
> 
> mtx_owned() braces are untolerable ugliness. You should either pass a
> boolean flag (preferred), or create locked/unlocked versions of the
> functions.
> 

That was a temporary hack.

> Similarly, I dislike vget_held().  Add a flag to vget(), see LK_EATTR_MASK
> in sys/lockmgr.h.
> 

lockmgr has no business knowing or not knowing whether we held the
vnode, so the flag would have to be cleared before it is passed to it.
But then it seems like an abuse of LK_* namespace.

But maybe I misunerstood your proposal.

> Could there be consequences of not taking vnode interlock and passing
> LK_INTERLOCK to vn_lock() in vget() ?
> 

You mean to add an assertion? I did it in the new patch.

> Taking interlock when vnode lock is already owned is probably fine and
> does not add to contention. I mean that making VI_OWEINACT so loose
> breaks the VOP_INACTIVE() contract.

namecache typically locks vnodes shared and in such cases vinactive is
not executed and only OWEINACT is cleared.

And it is a serialisation point. I just tested with multiple stats in
the same directory and it went from ~1100000 to ~620000 ops/s.

I can add unconditional locking of the interlock for exclusively locked
vnodes if you insist, bur for shared ones I don't see any benefit.

Patch below should be split in 3, but imho is sufficinetly readable for
review in one batch for review.

1. It adds refcount_{acquire,release}_if_greater functions to replace
open coded atomic_cmpset_int loops.

2. v_rdev->si_usecount manipulation is moved to v_{incr,decr}_devcount.

3. actual switch to atomics + some assertions

diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
index 83f29c1..b587ebd 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
@@ -99,6 +99,6 @@ vn_rele_async(vnode_t *vp, taskq_t *taskq)
 		    (task_func_t *)vn_rele_inactive, vp, TQ_SLEEP) != 0);
 		return;
 	}
-	vp->v_usecount--;
+	refcount_release(&vp->v_usecount);
 	vdropl(vp);
 }
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 55e3217..50a84d8 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -665,12 +665,12 @@ success:
 		ltype = VOP_ISLOCKED(dvp);
 		VOP_UNLOCK(dvp, 0);
 	}
-	VI_LOCK(*vpp);
+	vhold(*vpp);
 	if (wlocked)
 		CACHE_WUNLOCK();
 	else
 		CACHE_RUNLOCK();
-	error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread);
+	error = vget_held(*vpp, cnp->cn_lkflags, cnp->cn_thread);
 	if (cnp->cn_flags & ISDOTDOT) {
 		vn_lock(dvp, ltype | LK_RETRY);
 		if (dvp->v_iflag & VI_DOOMED) {
@@ -1376,9 +1376,9 @@ vn_dir_dd_ino(struct vnode *vp)
 		if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
 			continue;
 		ddvp = ncp->nc_dvp;
-		VI_LOCK(ddvp);
+		vhold(ddvp);
 		CACHE_RUNLOCK();
-		if (vget(ddvp, LK_INTERLOCK | LK_SHARED | LK_NOWAIT, curthread))
+		if (vget_held(ddvp, LK_SHARED | LK_NOWAIT, curthread))
 			return (NULL);
 		return (ddvp);
 	}
diff --git a/sys/kern/vfs_hash.c b/sys/kern/vfs_hash.c
index 0271e49..d2fdbba 100644
--- a/sys/kern/vfs_hash.c
+++ b/sys/kern/vfs_hash.c
@@ -83,9 +83,9 @@ vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, s
 				continue;
 			if (fn != NULL && fn(vp, arg))
 				continue;
-			VI_LOCK(vp);
+			vhold(vp);
 			mtx_unlock(&vfs_hash_mtx);
-			error = vget(vp, flags | LK_INTERLOCK, td);
+			error = vget_held(vp, flags, td);
 			if (error == ENOENT && (flags & LK_NOWAIT) == 0)
 				break;
 			if (error)
@@ -127,9 +127,9 @@ vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, stru
 				continue;
 			if (fn != NULL && fn(vp2, arg))
 				continue;
-			VI_LOCK(vp2);
+			vhold(vp2);
 			mtx_unlock(&vfs_hash_mtx);
-			error = vget(vp2, flags | LK_INTERLOCK, td);
+			error = vget_held(vp2, flags, td);
 			if (error == ENOENT && (flags & LK_NOWAIT) == 0)
 				break;
 			mtx_lock(&vfs_hash_mtx);
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 345aad6..564b5af 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -68,6 +68,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/pctrie.h>
 #include <sys/priv.h>
 #include <sys/reboot.h>
+#include <sys/refcount.h>
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/sleepqueue.h>
@@ -105,6 +106,8 @@ static void	v_incr_usecount(struct vnode *);
 static void	v_decr_usecount(struct vnode *);
 static void	v_decr_useonly(struct vnode *);
 static void	v_upgrade_usecount(struct vnode *);
+static void	v_incr_devcount(struct vnode *);
+static void	v_decr_devcount(struct vnode *);
 static void	vnlru_free(int);
 static void	vgonel(struct vnode *);
 static void	vfs_knllock(void *arg);
@@ -165,6 +168,10 @@ static int reassignbufcalls;
 SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
     "Number of calls to reassignbuf");
 
+static int vget_lock;
+SYSCTL_INT(_vfs, OID_AUTO, vget_lock, CTLFLAG_RW, &vget_lock, 0,
+    "Lock the interlock unconditionally in vget");
+
 /*
  * Cache for the mount type id assigned to NFS.  This is used for
  * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
@@ -854,7 +861,7 @@ vnlru_free(int count)
 		 */
 		freevnodes--;
 		vp->v_iflag &= ~VI_FREE;
-		vp->v_holdcnt++;
+		refcount_acquire(&vp->v_holdcnt);
 
 		mtx_unlock(&vnode_free_list_mtx);
 		VI_UNLOCK(vp);
@@ -2050,14 +2057,10 @@ static void
 v_incr_usecount(struct vnode *vp)
 {
 
+	ASSERT_VI_UNLOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
-	vholdl(vp);
-	vp->v_usecount++;
-	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
-		dev_lock();
-		vp->v_rdev->si_usecount++;
-		dev_unlock();
-	}
+	vhold(vp);
+	v_upgrade_usecount(vp);
 }
 
 /*
@@ -2068,13 +2071,14 @@ static void
 v_upgrade_usecount(struct vnode *vp)
 {
 
+	ASSERT_VI_UNLOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
-	vp->v_usecount++;
-	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
-		dev_lock();
-		vp->v_rdev->si_usecount++;
-		dev_unlock();
+	if (!refcount_acquire_if_greater(&vp->v_usecount, 0)) {
+		VI_LOCK(vp);
+		refcount_acquire(&vp->v_usecount);
+		VI_UNLOCK(vp);
 	}
+	v_incr_devcount(vp);
 }
 
 /*
@@ -2086,16 +2090,11 @@ static void
 v_decr_usecount(struct vnode *vp)
 {
 
-	ASSERT_VI_LOCKED(vp, __FUNCTION__);
+	ASSERT_VI_LOCKED(vp, __func__);
 	VNASSERT(vp->v_usecount > 0, vp,
 	    ("v_decr_usecount: negative usecount"));
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
-	vp->v_usecount--;
-	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
-		dev_lock();
-		vp->v_rdev->si_usecount--;
-		dev_unlock();
-	}
+	v_decr_useonly(vp);
 	vdropl(vp);
 }
 
@@ -2109,11 +2108,35 @@ static void
 v_decr_useonly(struct vnode *vp)
 {
 
-	ASSERT_VI_LOCKED(vp, __FUNCTION__);
+	ASSERT_VI_LOCKED(vp, __func__);
 	VNASSERT(vp->v_usecount > 0, vp,
 	    ("v_decr_useonly: negative usecount"));
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
-	vp->v_usecount--;
+	refcount_release(&vp->v_usecount);
+	v_decr_devcount(vp);
+}
+
+/*
+ * Increment si_usecount of the associated device, if any.
+ */
+static void
+v_incr_devcount(struct vnode *vp)
+{
+
+	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
+		dev_lock();
+		vp->v_rdev->si_usecount++;
+		dev_unlock();
+	}
+}
+
+/*
+ * Increment si_usecount of the associated device, if any.
+ */
+static void
+v_decr_devcount(struct vnode *vp)
+{
+
 	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
 		dev_lock();
 		vp->v_rdev->si_usecount--;
@@ -2129,19 +2152,19 @@ v_decr_useonly(struct vnode *vp)
  * vput try to do it here.
  */
 int
-vget(struct vnode *vp, int flags, struct thread *td)
+vget_held(struct vnode *vp, int flags, struct thread *td)
 {
 	int error;
 
-	error = 0;
 	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
 	    ("vget: invalid lock operation"));
+	if ((flags & LK_INTERLOCK) != 0)
+		ASSERT_VI_LOCKED(vp, __func__);
+	else
+		ASSERT_VI_UNLOCKED(vp, __func__);
 	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
 
-	if ((flags & LK_INTERLOCK) == 0)
-		VI_LOCK(vp);
-	vholdl(vp);
-	if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
+	if ((error = vn_lock(vp, flags)) != 0) {
 		vdrop(vp);
 		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
 		    vp);
@@ -2149,7 +2172,6 @@ vget(struct vnode *vp, int flags, struct thread *td)
 	}
 	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
 		panic("vget: vn_lock failed to return ENOENT\n");
-	VI_LOCK(vp);
 	/* Upgrade our holdcnt to a usecount. */
 	v_upgrade_usecount(vp);
 	/*
@@ -2158,16 +2180,27 @@ vget(struct vnode *vp, int flags, struct thread *td)
 	 * here at preventing a reference to a removed file.  If
 	 * we don't succeed no harm is done.
 	 */
-	if (vp->v_iflag & VI_OWEINACT) {
-		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
-		    (flags & LK_NOWAIT) == 0)
-			vinactive(vp, td);
-		vp->v_iflag &= ~VI_OWEINACT;
+	if (vget_lock || vp->v_iflag & VI_OWEINACT) {
+		VI_LOCK(vp);
+		if (vp->v_iflag & VI_OWEINACT) {
+			if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
+			    (flags & LK_NOWAIT) == 0)
+				vinactive(vp, td);
+			vp->v_iflag &= ~VI_OWEINACT;
+		}
+		VI_UNLOCK(vp);
 	}
-	VI_UNLOCK(vp);
 	return (0);
 }
 
+int
+vget(struct vnode *vp, int flags, struct thread *td)
+{
+
+	_vhold(vp, (flags & LK_INTERLOCK) != 0);
+	return (vget_held(vp, flags, td));
+}
+
 /*
  * Increase the reference count of a vnode.
  */
@@ -2176,9 +2209,7 @@ vref(struct vnode *vp)
 {
 
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
-	VI_LOCK(vp);
 	v_incr_usecount(vp);
-	VI_UNLOCK(vp);
 }
 
 /*
@@ -2193,13 +2224,8 @@ vref(struct vnode *vp)
 int
 vrefcnt(struct vnode *vp)
 {
-	int usecnt;
-
-	VI_LOCK(vp);
-	usecnt = vp->v_usecount;
-	VI_UNLOCK(vp);
 
-	return (usecnt);
+	return (vp->v_usecount);
 }
 
 #define	VPUTX_VRELE	1
@@ -2218,12 +2244,19 @@ vputx(struct vnode *vp, int func)
 		ASSERT_VOP_LOCKED(vp, "vput");
 	else
 		KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
+	ASSERT_VI_UNLOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+
+	if (refcount_release_if_greater(&vp->v_usecount, 1)) {
+		if (func == VPUTX_VPUT)
+			VOP_UNLOCK(vp, 0);
+		v_decr_devcount(vp);
+		vdrop(vp);
+		return;
+	}
+
 	VI_LOCK(vp);
 
-	/* Skip this v_writecount check if we're going to panic below. */
-	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
-	    ("vputx: missed vn_close"));
 	error = 0;
 
 	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
@@ -2314,38 +2347,32 @@ vunref(struct vnode *vp)
 }
 
 /*
- * Somebody doesn't want the vnode recycled.
- */
-void
-vhold(struct vnode *vp)
-{
-
-	VI_LOCK(vp);
-	vholdl(vp);
-	VI_UNLOCK(vp);
-}
-
-/*
  * Increase the hold count and activate if this is the first reference.
  */
 void
-vholdl(struct vnode *vp)
+_vhold(struct vnode *vp, bool locked)
 {
 	struct mount *mp;
 
+	if (locked)
+		ASSERT_VI_LOCKED(vp, __func__);
+	else
+		ASSERT_VI_UNLOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
-#ifdef INVARIANTS
-	/* getnewvnode() calls v_incr_usecount() without holding interlock. */
-	if (vp->v_type != VNON || vp->v_data != NULL) {
-		ASSERT_VI_LOCKED(vp, "vholdl");
-		VNASSERT(vp->v_holdcnt > 0 || (vp->v_iflag & VI_FREE) != 0,
-		    vp, ("vholdl: free vnode is held"));
+	if (refcount_acquire_if_greater(&vp->v_holdcnt, 0)) {
+		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
+		    ("_vhold: vnode with holdcnt is free"));
+		return;
 	}
-#endif
-	vp->v_holdcnt++;
-	if ((vp->v_iflag & VI_FREE) == 0)
+	if (!locked)
+		VI_LOCK(vp);
+	if ((vp->v_iflag & VI_FREE) == 0) {
+		refcount_acquire(&vp->v_holdcnt);
+		if (!locked)
+			VI_UNLOCK(vp);
 		return;
-	VNASSERT(vp->v_holdcnt == 1, vp, ("vholdl: wrong hold count"));
+	}
+	VNASSERT(vp->v_holdcnt == 0, vp, ("vholdl: wrong hold count"));
 	VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed."));
 	/*
 	 * Remove a vnode from the free list, mark it as in use,
@@ -2362,18 +2389,9 @@ vholdl(struct vnode *vp)
 	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
 	mp->mnt_activevnodelistsize++;
 	mtx_unlock(&vnode_free_list_mtx);
-}
-
-/*
- * Note that there is one less who cares about this vnode.
- * vdrop() is the opposite of vhold().
- */
-void
-vdrop(struct vnode *vp)
-{
-
-	VI_LOCK(vp);
-	vdropl(vp);
+	refcount_acquire(&vp->v_holdcnt);
+	if (!locked)
+		VI_UNLOCK(vp);
 }
 
 /*
@@ -2382,20 +2400,28 @@ vdrop(struct vnode *vp)
  * (marked VI_DOOMED) in which case we will free it.
  */
 void
-vdropl(struct vnode *vp)
+_vdrop(struct vnode *vp, bool locked)
 {
 	struct bufobj *bo;
 	struct mount *mp;
 	int active;
 
-	ASSERT_VI_LOCKED(vp, "vdropl");
+	if (locked)
+		ASSERT_VI_LOCKED(vp, __func__);
+	else
+		ASSERT_VI_UNLOCKED(vp, __func__);
 	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
 	if (vp->v_holdcnt <= 0)
 		panic("vdrop: holdcnt %d", vp->v_holdcnt);
-	vp->v_holdcnt--;
-	VNASSERT(vp->v_holdcnt >= vp->v_usecount, vp,
-	    ("hold count less than use count"));
-	if (vp->v_holdcnt > 0) {
+	if (refcount_release_if_greater(&vp->v_holdcnt, 1)) {
+		if (locked)
+			VI_UNLOCK(vp);
+		return;
+	}
+
+	if (!locked)
+		VI_LOCK(vp);
+	if (refcount_release(&vp->v_holdcnt) == 0) {
 		VI_UNLOCK(vp);
 		return;
 	}
diff --git a/sys/sys/refcount.h b/sys/sys/refcount.h
index 4611664..360d50d 100644
--- a/sys/sys/refcount.h
+++ b/sys/sys/refcount.h
@@ -64,4 +64,32 @@ refcount_release(volatile u_int *count)
 	return (old == 1);
 }
 
+static __inline int
+refcount_acquire_if_greater(volatile u_int *count, int val)
+{
+	int old;
+retry:
+	old = *count;
+	if (old > val) {
+		if (atomic_cmpset_int(count, old, old + 1))
+			return (true);
+		goto retry;
+	}
+	return (false);
+}
+
+static __inline int
+refcount_release_if_greater(volatile u_int *count, int val)
+{
+	int old;
+retry:
+	old = *count;
+	if (old > val) {
+		if (atomic_cmpset_int(count, old, old - 1))
+			return (true);
+		goto retry;
+	}
+	return (false);
+}
+
 #endif	/* ! __SYS_REFCOUNT_H__ */
diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h
index c78b9d1..2aab48a 100644
--- a/sys/sys/vnode.h
+++ b/sys/sys/vnode.h
@@ -647,13 +647,16 @@ int	vaccess_acl_posix1e(enum vtype type, uid_t file_uid,
 	    struct ucred *cred, int *privused);
 void	vattr_null(struct vattr *vap);
 int	vcount(struct vnode *vp);
-void	vdrop(struct vnode *);
-void	vdropl(struct vnode *);
+#define	vdrop(vp)	_vdrop((vp), 0)
+#define	vdropl(vp)	_vdrop((vp), 1)
+void	_vdrop(struct vnode *, bool);
 int	vflush(struct mount *mp, int rootrefs, int flags, struct thread *td);
 int	vget(struct vnode *vp, int lockflag, struct thread *td);
+int	vget_held(struct vnode *vp, int lockflag, struct thread *td);
 void	vgone(struct vnode *vp);
-void	vhold(struct vnode *);
-void	vholdl(struct vnode *);
+#define	vhold(vp)	_vhold((vp), 0)
+#define	vholdl(vp)	_vhold((vp), 1)
+void	_vhold(struct vnode *, bool);
 void	vinactive(struct vnode *, struct thread *);
 int	vinvalbuf(struct vnode *vp, int save, int slpflag, int slptimeo);
 int	vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length,



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20141122211147.GA23623>