Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 22 Sep 2001 13:22:51 -0700 (PDT)
From:      Matt Dillon <dillon@earth.backplane.com>
To:        Seigo Tanimura <tanimura@r.dl.itc.u-tokyo.ac.jp>, phk@critter.freebsd.dk, tanimura@r.dl.itc.u-tokyo.ac.jp, bright@wintelcom.net
Cc:        hackers@freebsd.org
Subject:   More on the cache_purgeleafdirs() routine
Message-ID:  <200109222022.f8MKMpU81911@earth.backplane.com>
References:  <200104161024.f3GAO7Z34787@rina.r.dl.itc.u-tokyo.ac.jp> <1198.987417363@critter> <200104161223.f3GCNZZ51680@rina.r.dl.itc.u-tokyo.ac.jp>

next in thread | previous in thread | raw e-mail | index | archive | help

    Hi guys.  I've been tracking down issues with vnode recycling.. 
    specifically, getnewvnode() deadlocks in -stable in large-memory
    configurations.  I took a real good look at cache_purgeleafdirs() 
    in -current to see if it was MFCable as a solution.  But....

    There are a number of issues... well, there is really one big issue, and
    that is the simple fact that there can be upwards of 260,000+ entries
    in the name cache and cache_purgeleafdirs() doesn't scale.  It is an
    O(N*M) algorithm.  Any system that requires a great deal of vnode
    recycling -- for example Yahoo's userbase lookup (one file per userid) --
    would be terribly impacted by this algorithm.

    It seems to me that the best way to deal with this is to simply have
    getnewvnode() zap the namei-cache, and have vfs_lookup() requeue
    (for LRU purposes) any namei cache associated vnodes that are on the
    freelist.  The patch is much less complex... here is a preliminary
    patch (without the LRU requeueing, I haven't gotten to that yet, and 
    I haven't completely removed the non-scalable recycling code).

    This code seems to work quite well even without the LRU requeueing,
    especially if you turn on vfs.vmiodirenable.  I don't see any reason
    to be totally rabid about keeping top level directory entries in-core...
    at least not at the cost of the current algorithm in -current.  We 
    want to try to keep them in core, but I believe that LRU requeueing
    in vfs_lookup() may be sufficient for that... or possibly even something
    as simple as a flag in the namei caching structure that guarantees the
    first N directory levels are left in-core.  Or something like that. 
    But not a complete scan of the namei cache!

						    -Matt

Index: kern/vfs_cache.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_cache.c,v
retrieving revision 1.61
diff -u -r1.61 vfs_cache.c
--- kern/vfs_cache.c	2001/09/12 08:37:46	1.61
+++ kern/vfs_cache.c	2001/09/22 20:13:47
@@ -101,8 +101,10 @@
 SYSCTL_ULONG(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, "");
 static u_long	numcachehv;		/* number of cache entries with vnodes held */
 SYSCTL_ULONG(_debug, OID_AUTO, numcachehv, CTLFLAG_RD, &numcachehv, 0, "");
+#if 0
 static u_long	numcachepl;		/* number of cache purge for leaf entries */
 SYSCTL_ULONG(_debug, OID_AUTO, numcachepl, CTLFLAG_RD, &numcachepl, 0, "");
+#endif
 struct	nchstats nchstats;		/* cache effectiveness statistics */
 
 static int	doingcache = 1;		/* 1 => enable the cache */
@@ -476,6 +478,20 @@
 }
 
 /*
+ * Flush the namei cache references associated with a vnode.
+ * The vnode remains valid.
+ */
+void
+cache_flush(vp)
+	struct vnode *vp;
+{
+	while (!LIST_EMPTY(&vp->v_cache_src)) 
+		cache_zap(LIST_FIRST(&vp->v_cache_src));
+	while (!TAILQ_EMPTY(&vp->v_cache_dst)) 
+		cache_zap(TAILQ_FIRST(&vp->v_cache_dst));
+}
+
+/*
  * Flush all entries referencing a particular filesystem.
  *
  * Since we need to check it anyway, we will flush all the invalid
@@ -499,6 +515,8 @@
 	}
 }
 
+#if 0
+
 /*
  * Flush all dirctory entries with no child directories held in
  * the cache.
@@ -554,6 +572,8 @@
 	}
 	numcachepl++;
 }
+
+#endif
 
 /*
  * Perform canonical checks and cache lookup and pass on to filesystem
Index: kern/vfs_subr.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_subr.c,v
retrieving revision 1.319
diff -u -r1.319 vfs_subr.c
--- kern/vfs_subr.c	2001/09/12 08:37:47	1.319
+++ kern/vfs_subr.c	2001/09/22 20:15:11
@@ -110,6 +110,8 @@
 /* Number of vnodes in the free list. */
 static u_long freevnodes = 0;
 SYSCTL_LONG(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, "");
+
+#if 0
 /* Number of vnode allocation. */
 static u_long vnodeallocs = 0;
 SYSCTL_LONG(_debug, OID_AUTO, vnodeallocs, CTLFLAG_RD, &vnodeallocs, 0, "");
@@ -125,6 +127,7 @@
 /* Number of vnodes attempted to recycle at a time. */
 static u_long vnoderecyclenumber = 3000;
 SYSCTL_LONG(_debug, OID_AUTO, vnoderecyclenumber, CTLFLAG_RW, &vnoderecyclenumber, 0, "");
+#endif
 
 /*
  * Various variables used for debugging the new implementation of
@@ -556,8 +559,13 @@
 		 * Don't recycle if active in the namecache or
 		 * if it still has cached pages or we cannot get
 		 * its interlock.
+		 *
+		 * XXX the namei cache can hold onto vnodes too long,
+		 * causing us to run out of MALLOC space.  Instead, we 
+		 * should make path lookups requeue any vnodes on the free
+		 * list.
 		 */
-		if (LIST_FIRST(&vp->v_cache_src) != NULL ||
+		if (/* LIST_FIRST(&vp->v_cache_src) != NULL || */
 		    (VOP_GETVOBJECT(vp, &object) == 0 &&
 		     (object->resident_page_count || object->ref_count)) ||
 		    !mtx_trylock(&vp->v_interlock)) {
@@ -636,6 +644,7 @@
 
 	vfs_object_create(vp, td, td->td_proc->p_ucred);
 
+#if 0
 	vnodeallocs++;
 	if (vnodeallocs % vnoderecycleperiod == 0 &&
 	    freevnodes < vnoderecycleminfreevn &&
@@ -643,6 +652,7 @@
 		/* Recycle vnodes. */
 		cache_purgeleafdirs(vnoderecyclenumber);
 	}
+#endif
 
 	return (0);
 }
Index: sys/vnode.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/vnode.h,v
retrieving revision 1.157
diff -u -r1.157 vnode.h
--- sys/vnode.h	2001/09/13 22:52:42	1.157
+++ sys/vnode.h	2001/09/22 20:18:45
@@ -558,8 +558,8 @@
 int	cache_lookup __P((struct vnode *dvp, struct vnode **vpp,
 	    struct componentname *cnp));
 void	cache_purge __P((struct vnode *vp));
+void	cache_flush __P((struct vnode *vp));
 void	cache_purgevfs __P((struct mount *mp));
-void	cache_purgeleafdirs __P((int ndir));
 void	cvtstat __P((struct stat *st, struct ostat *ost));
 void	cvtnstat __P((struct stat *sb, struct nstat *nsb));
 int	getnewvnode __P((enum vtagtype tag,

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-hackers" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200109222022.f8MKMpU81911>