Skip site navigation (1) | Skip section navigation (2)
Date:      Tue, 19 Mar 2013 17:49:19 -0400 (EDT)
From:      Rick Macklem <rmacklem@uoguelph.ca>
To:        Jason Keltz <jas@cse.yorku.ca>
Cc:        freebsd-fs@freebsd.org
Subject:   Re: best freebsd version for zfs file server
Message-ID:  <312742115.4078123.1363729759052.JavaMail.root@erie.cs.uoguelph.ca>
In-Reply-To: <5148CB42.6090001@cse.yorku.ca>

next in thread | previous in thread | raw e-mail | index | archive | help

[-- Attachment #1 --]
Jason Keltz wrote:
> Hi.
> I hope to soon put into production a new file server hosting many ZFS
> filesystems with FreeBSD. The system has 2 x 9205-8e cards, and 1 x
> 9207-8i card and 24 x 900 GB 10K RPM drives. I'm trying to figure out
> what is ultimately the "best" version of FreeBSD to run on a
> production
> file server. I believe that it doesn't make sense to stick directly to
> the 9.1/release because there have already been many ZFS problems that
> were solved in 9.1/stable. On the other hand, stable doesn't
> necessarily have to be "stable"! Of course "release" might not be
> "stable" either if there's a bug that say, causes a hang on my
> controller card, and it's not fixed in anything but "stable"! Yet,
> "stable" might "break" something else. I'm wondering what people who
> are running FreeBSD file servers in production do -- do you track
> individual changes, and compile release + individual bug fixes that
> likely affect you, or, in my case, if I run "stable", do all my
> testing
> with "stable", do I run that version of stable, and only attempt to
> upgrade to the next "stable" release while very carefully reviewing
> the
> bug list, then holding my breath when the server comes up? Any
> recommendations would be appreciated. I know there are a lot of people
> who are happily running FreeBSD file servers. :)
> 
> Jason.
> 
You might want to consider the attached patch which Garrett Wollman has
been testing. It is not even in head yet, but earlier versions of the
patch have been in testing for a while.

It allows you to adjust tunables to trade increased storage use in the NFS
server's duplicate request cache (DRC; mostly mbuf clusters) for decreased
mutex lock contention and CPU overhead.

rick

> On 03/19/2013 03:04 PM, Dmitry Morozovsky wrote:
> > On Tue, 19 Mar 2013, Tom Evans wrote:
> >
> >>> I'm currently in the process of making a new backup server, based on LSI
> >>> 9260
> >>> controller. I'm planning to use ZFS over disks, hence the most
> >>> natural way
> >>> seems to be to configure mfi to JBOD mode - but I can't find an easy way to
> >>> reach this,
> >>> neither in BIOS utilities nor via MegaCli
> >> 9260 should be SAS-2008 based, so mps(4) not mfi(4).
> > Well, it is at least detected by stable/9 GENERIC as mfi
> >
> >> The internet[1] suggests that this card should be flashable to a
> >> 9211-8i with IT mode firmware, which is just about the ultimate ZFS
> >> card, instant-JBOD on inserting a disk, passthru for SMART, high
> >> performance, etc.
> > Will check, thanks for the reference.
> >
> >> [1]
> >> http://blog.grem.de/sysadmin/LSI-SAS2008-Flashing-2012-04-12-22-17.html
> >>
> 
> _______________________________________________
> freebsd-fs@freebsd.org mailing list
> http://lists.freebsd.org/mailman/listinfo/freebsd-fs
> To unsubscribe, send any mail to "freebsd-fs-unsubscribe@freebsd.org"

[-- Attachment #2 --]
--- fs/nfsserver/nfs_nfsdcache.c.orig	2013-01-07 09:04:13.000000000 -0500
+++ fs/nfsserver/nfs_nfsdcache.c	2013-03-12 22:42:05.000000000 -0400
@@ -160,12 +160,31 @@ __FBSDID("$FreeBSD: projects/nfsv4-packr
 #include <fs/nfs/nfsport.h>
 
 extern struct nfsstats newnfsstats;
-NFSCACHEMUTEX;
+extern struct mtx nfsrc_tcpmtx[NFSRVCACHE_HASHSIZE];
+extern struct mtx nfsrc_udpmtx;
 int nfsrc_floodlevel = NFSRVCACHE_FLOODLEVEL, nfsrc_tcpsavedreplies = 0;
 #endif	/* !APPLEKEXT */
 
-static int nfsrc_tcpnonidempotent = 1;
-static int nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER, nfsrc_udpcachesize = 0;
+SYSCTL_DECL(_vfs_nfsd);
+
+static u_int	nfsrc_tcphighwater = 0;
+SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcphighwater, CTLFLAG_RW,
+    &nfsrc_tcphighwater, 0,
+    "High water mark for TCP cache entries");
+static u_int	nfsrc_udphighwater = NFSRVCACHE_UDPHIGHWATER;
+SYSCTL_UINT(_vfs_nfsd, OID_AUTO, udphighwater, CTLFLAG_RW,
+    &nfsrc_udphighwater, 0,
+    "High water mark for UDP cache entries");
+static u_int	nfsrc_tcptimeout = NFSRVCACHE_TCPTIMEOUT;
+SYSCTL_UINT(_vfs_nfsd, OID_AUTO, tcpcachetimeo, CTLFLAG_RW,
+    &nfsrc_tcptimeout, 0,
+    "Timeout for TCP entries in the DRC");
+static u_int nfsrc_tcpnonidempotent = 1;
+SYSCTL_UINT(_vfs_nfsd, OID_AUTO, cachetcp, CTLFLAG_RW,
+    &nfsrc_tcpnonidempotent, 0,
+    "Enable the DRC for NFS over TCP");
+
+static int nfsrc_udpcachesize = 0;
 static TAILQ_HEAD(, nfsrvcache) nfsrvudplru;
 static struct nfsrvhashhead nfsrvhashtbl[NFSRVCACHE_HASHSIZE],
     nfsrvudphashtbl[NFSRVCACHE_HASHSIZE];
@@ -197,10 +216,11 @@ static int newnfsv2_procid[NFS_V3NPROCS]
 	NFSV2PROC_NOOP,
 };
 
+#define	nfsrc_hash(xid)	(((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE)
 #define	NFSRCUDPHASH(xid) \
-	(&nfsrvudphashtbl[((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE])
+	(&nfsrvudphashtbl[nfsrc_hash(xid)])
 #define	NFSRCHASH(xid) \
-	(&nfsrvhashtbl[((xid) + ((xid) >> 24)) % NFSRVCACHE_HASHSIZE])
+	(&nfsrvhashtbl[nfsrc_hash(xid)])
 #define	TRUE	1
 #define	FALSE	0
 #define	NFSRVCACHE_CHECKLEN	100
@@ -251,6 +271,18 @@ static int nfsrc_getlenandcksum(mbuf_t m
 static void nfsrc_marksametcpconn(u_int64_t);
 
 /*
+ * Return the correct mutex for this cache entry.
+ */
+static __inline struct mtx *
+nfsrc_cachemutex(struct nfsrvcache *rp)
+{
+
+	if ((rp->rc_flag & RC_UDP) != 0)
+		return (&nfsrc_udpmtx);
+	return (&nfsrc_tcpmtx[nfsrc_hash(rp->rc_xid)]);
+}
+
+/*
  * Initialize the server request cache list
  */
 APPLESTATIC void
@@ -325,10 +357,12 @@ nfsrc_getudp(struct nfsrv_descript *nd, 
 	struct sockaddr_in6 *saddr6;
 	struct nfsrvhashhead *hp;
 	int ret = 0;
+	struct mtx *mutex;
 
+	mutex = nfsrc_cachemutex(newrp);
 	hp = NFSRCUDPHASH(newrp->rc_xid);
 loop:
-	NFSLOCKCACHE();
+	mtx_lock(mutex);
 	LIST_FOREACH(rp, hp, rc_hash) {
 	    if (newrp->rc_xid == rp->rc_xid &&
 		newrp->rc_proc == rp->rc_proc &&
@@ -336,8 +370,8 @@ loop:
 		nfsaddr_match(NETFAMILY(rp), &rp->rc_haddr, nd->nd_nam)) {
 			if ((rp->rc_flag & RC_LOCKED) != 0) {
 				rp->rc_flag |= RC_WANTED;
-				(void)mtx_sleep(rp, NFSCACHEMUTEXPTR,
-				    (PZERO - 1) | PDROP, "nfsrc", 10 * hz);
+				(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
+				    "nfsrc", 10 * hz);
 				goto loop;
 			}
 			if (rp->rc_flag == 0)
@@ -347,14 +381,14 @@ loop:
 			TAILQ_INSERT_TAIL(&nfsrvudplru, rp, rc_lru);
 			if (rp->rc_flag & RC_INPROG) {
 				newnfsstats.srvcache_inproghits++;
-				NFSUNLOCKCACHE();
+				mtx_unlock(mutex);
 				ret = RC_DROPIT;
 			} else if (rp->rc_flag & RC_REPSTATUS) {
 				/*
 				 * V2 only.
 				 */
 				newnfsstats.srvcache_nonidemdonehits++;
-				NFSUNLOCKCACHE();
+				mtx_unlock(mutex);
 				nfsrvd_rephead(nd);
 				*(nd->nd_errp) = rp->rc_status;
 				ret = RC_REPLY;
@@ -362,7 +396,7 @@ loop:
 					NFSRVCACHE_UDPTIMEOUT;
 			} else if (rp->rc_flag & RC_REPMBUF) {
 				newnfsstats.srvcache_nonidemdonehits++;
-				NFSUNLOCKCACHE();
+				mtx_unlock(mutex);
 				nd->nd_mreq = m_copym(rp->rc_reply, 0,
 					M_COPYALL, M_WAITOK);
 				ret = RC_REPLY;
@@ -392,7 +426,7 @@ loop:
 	}
 	LIST_INSERT_HEAD(hp, newrp, rc_hash);
 	TAILQ_INSERT_TAIL(&nfsrvudplru, newrp, rc_lru);
-	NFSUNLOCKCACHE();
+	mtx_unlock(mutex);
 	nd->nd_rp = newrp;
 	ret = RC_DOIT;
 
@@ -410,12 +444,16 @@ nfsrvd_updatecache(struct nfsrv_descript
 	struct nfsrvcache *rp;
 	struct nfsrvcache *retrp = NULL;
 	mbuf_t m;
+	struct mtx *mutex;
 
+	if (nfsrc_tcphighwater > nfsrc_floodlevel)
+		nfsrc_floodlevel = nfsrc_tcphighwater;
 	rp = nd->nd_rp;
 	if (!rp)
 		panic("nfsrvd_updatecache null rp");
 	nd->nd_rp = NULL;
-	NFSLOCKCACHE();
+	mutex = nfsrc_cachemutex(rp);
+	mtx_lock(mutex);
 	nfsrc_lock(rp);
 	if (!(rp->rc_flag & RC_INPROG))
 		panic("nfsrvd_updatecache not inprog");
@@ -430,7 +468,7 @@ nfsrvd_updatecache(struct nfsrv_descript
 	 */
 	if (nd->nd_repstat == NFSERR_REPLYFROMCACHE) {
 		newnfsstats.srvcache_nonidemdonehits++;
-		NFSUNLOCKCACHE();
+		mtx_unlock(mutex);
 		nd->nd_repstat = 0;
 		if (nd->nd_mreq)
 			mbuf_freem(nd->nd_mreq);
@@ -438,7 +476,7 @@ nfsrvd_updatecache(struct nfsrv_descript
 			panic("reply from cache");
 		nd->nd_mreq = m_copym(rp->rc_reply, 0,
 		    M_COPYALL, M_WAITOK);
-		rp->rc_timestamp = NFSD_MONOSEC + NFSRVCACHE_TCPTIMEOUT;
+		rp->rc_timestamp = NFSD_MONOSEC + nfsrc_tcptimeout;
 		nfsrc_unlock(rp);
 		goto out;
 	}
@@ -463,21 +501,21 @@ nfsrvd_updatecache(struct nfsrv_descript
 		    nfsv2_repstat[newnfsv2_procid[nd->nd_procnum]]) {
 			rp->rc_status = nd->nd_repstat;
 			rp->rc_flag |= RC_REPSTATUS;
-			NFSUNLOCKCACHE();
+			mtx_unlock(mutex);
 		} else {
 			if (!(rp->rc_flag & RC_UDP)) {
-			    nfsrc_tcpsavedreplies++;
+			    atomic_add_int(&nfsrc_tcpsavedreplies, 1);
 			    if (nfsrc_tcpsavedreplies >
 				newnfsstats.srvcache_tcppeak)
 				newnfsstats.srvcache_tcppeak =
 				    nfsrc_tcpsavedreplies;
 			}
-			NFSUNLOCKCACHE();
-			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAITOK);
-			NFSLOCKCACHE();
+			mtx_unlock(mutex);
+			m = m_copym(nd->nd_mreq, 0, M_COPYALL, M_WAIT);
+			mtx_lock(mutex);
 			rp->rc_reply = m;
 			rp->rc_flag |= RC_REPMBUF;
-			NFSUNLOCKCACHE();
+			mtx_unlock(mutex);
 		}
 		if (rp->rc_flag & RC_UDP) {
 			rp->rc_timestamp = NFSD_MONOSEC +
@@ -485,7 +523,7 @@ nfsrvd_updatecache(struct nfsrv_descript
 			nfsrc_unlock(rp);
 		} else {
 			rp->rc_timestamp = NFSD_MONOSEC +
-			    NFSRVCACHE_TCPTIMEOUT;
+			    nfsrc_tcptimeout;
 			if (rp->rc_refcnt > 0)
 				nfsrc_unlock(rp);
 			else
@@ -493,7 +531,7 @@ nfsrvd_updatecache(struct nfsrv_descript
 		}
 	} else {
 		nfsrc_freecache(rp);
-		NFSUNLOCKCACHE();
+		mtx_unlock(mutex);
 	}
 
 out:
@@ -509,14 +547,16 @@ out:
 APPLESTATIC void
 nfsrvd_delcache(struct nfsrvcache *rp)
 {
+	struct mtx *mutex;
 
+	mutex = nfsrc_cachemutex(rp);
 	if (!(rp->rc_flag & RC_INPROG))
 		panic("nfsrvd_delcache not in prog");
-	NFSLOCKCACHE();
+	mtx_lock(mutex);
 	rp->rc_flag &= ~RC_INPROG;
 	if (rp->rc_refcnt == 0 && !(rp->rc_flag & RC_LOCKED))
 		nfsrc_freecache(rp);
-	NFSUNLOCKCACHE();
+	mtx_unlock(mutex);
 }
 
 /*
@@ -528,7 +568,9 @@ APPLESTATIC void
 nfsrvd_sentcache(struct nfsrvcache *rp, struct socket *so, int err)
 {
 	tcp_seq tmp_seq;
+	struct mtx *mutex;
 
+	mutex = nfsrc_cachemutex(rp);
 	if (!(rp->rc_flag & RC_LOCKED))
 		panic("nfsrvd_sentcache not locked");
 	if (!err) {
@@ -537,10 +579,10 @@ nfsrvd_sentcache(struct nfsrvcache *rp, 
 		     so->so_proto->pr_protocol != IPPROTO_TCP)
 			panic("nfs sent cache");
 		if (nfsrv_getsockseqnum(so, &tmp_seq)) {
-			NFSLOCKCACHE();
+			mtx_lock(mutex);
 			rp->rc_tcpseq = tmp_seq;
 			rp->rc_flag |= RC_TCPSEQ;
-			NFSUNLOCKCACHE();
+			mtx_unlock(mutex);
 		}
 	}
 	nfsrc_unlock(rp);
@@ -559,11 +601,13 @@ nfsrc_gettcp(struct nfsrv_descript *nd, 
 	struct nfsrvcache *hitrp;
 	struct nfsrvhashhead *hp, nfsrc_templist;
 	int hit, ret = 0;
+	struct mtx *mutex;
 
+	mutex = nfsrc_cachemutex(newrp);
 	hp = NFSRCHASH(newrp->rc_xid);
 	newrp->rc_reqlen = nfsrc_getlenandcksum(nd->nd_mrep, &newrp->rc_cksum);
 tryagain:
-	NFSLOCKCACHE();
+	mtx_lock(mutex);
 	hit = 1;
 	LIST_INIT(&nfsrc_templist);
 	/*
@@ -621,8 +665,8 @@ tryagain:
 		rp = hitrp;
 		if ((rp->rc_flag & RC_LOCKED) != 0) {
 			rp->rc_flag |= RC_WANTED;
-			(void)mtx_sleep(rp, NFSCACHEMUTEXPTR,
-			    (PZERO - 1) | PDROP, "nfsrc", 10 * hz);
+			(void)mtx_sleep(rp, mutex, (PZERO - 1) | PDROP,
+			    "nfsrc", 10 * hz);
 			goto tryagain;
 		}
 		if (rp->rc_flag == 0)
@@ -630,7 +674,7 @@ tryagain:
 		rp->rc_flag |= RC_LOCKED;
 		if (rp->rc_flag & RC_INPROG) {
 			newnfsstats.srvcache_inproghits++;
-			NFSUNLOCKCACHE();
+			mtx_unlock(mutex);
 			if (newrp->rc_sockref == rp->rc_sockref)
 				nfsrc_marksametcpconn(rp->rc_sockref);
 			ret = RC_DROPIT;
@@ -639,24 +683,24 @@ tryagain:
 			 * V2 only.
 			 */
 			newnfsstats.srvcache_nonidemdonehits++;
-			NFSUNLOCKCACHE();
+			mtx_unlock(mutex);
 			if (newrp->rc_sockref == rp->rc_sockref)
 				nfsrc_marksametcpconn(rp->rc_sockref);
 			ret = RC_REPLY;
 			nfsrvd_rephead(nd);
 			*(nd->nd_errp) = rp->rc_status;
 			rp->rc_timestamp = NFSD_MONOSEC +
-				NFSRVCACHE_TCPTIMEOUT;
+				nfsrc_tcptimeout;
 		} else if (rp->rc_flag & RC_REPMBUF) {
 			newnfsstats.srvcache_nonidemdonehits++;
-			NFSUNLOCKCACHE();
+			mtx_unlock(mutex);
 			if (newrp->rc_sockref == rp->rc_sockref)
 				nfsrc_marksametcpconn(rp->rc_sockref);
 			ret = RC_REPLY;
 			nd->nd_mreq = m_copym(rp->rc_reply, 0,
 				M_COPYALL, M_WAITOK);
 			rp->rc_timestamp = NFSD_MONOSEC +
-				NFSRVCACHE_TCPTIMEOUT;
+				nfsrc_tcptimeout;
 		} else {
 			panic("nfs tcp cache1");
 		}
@@ -674,7 +718,7 @@ tryagain:
 	newrp->rc_cachetime = NFSD_MONOSEC;
 	newrp->rc_flag |= RC_INPROG;
 	LIST_INSERT_HEAD(hp, newrp, rc_hash);
-	NFSUNLOCKCACHE();
+	mtx_unlock(mutex);
 	nd->nd_rp = newrp;
 	ret = RC_DOIT;
 
@@ -685,16 +729,17 @@ out:
 
 /*
  * Lock a cache entry.
- * Also puts a mutex lock on the cache list.
  */
 static void
 nfsrc_lock(struct nfsrvcache *rp)
 {
-	NFSCACHELOCKREQUIRED();
+	struct mtx *mutex;
+
+	mutex = nfsrc_cachemutex(rp);
+	mtx_assert(mutex, MA_OWNED);
 	while ((rp->rc_flag & RC_LOCKED) != 0) {
 		rp->rc_flag |= RC_WANTED;
-		(void)mtx_sleep(rp, NFSCACHEMUTEXPTR, PZERO - 1,
-		    "nfsrc", 0);
+		(void)mtx_sleep(rp, mutex, PZERO - 1, "nfsrc", 0);
 	}
 	rp->rc_flag |= RC_LOCKED;
 }
@@ -705,11 +750,13 @@ nfsrc_lock(struct nfsrvcache *rp)
 static void
 nfsrc_unlock(struct nfsrvcache *rp)
 {
+	struct mtx *mutex;
 
-	NFSLOCKCACHE();
+	mutex = nfsrc_cachemutex(rp);
+	mtx_lock(mutex);
 	rp->rc_flag &= ~RC_LOCKED;
 	nfsrc_wanted(rp);
-	NFSUNLOCKCACHE();
+	mtx_unlock(mutex);
 }
 
 /*
@@ -732,7 +779,6 @@ static void
 nfsrc_freecache(struct nfsrvcache *rp)
 {
 
-	NFSCACHELOCKREQUIRED();
 	LIST_REMOVE(rp, rc_hash);
 	if (rp->rc_flag & RC_UDP) {
 		TAILQ_REMOVE(&nfsrvudplru, rp, rc_lru);
@@ -742,7 +788,7 @@ nfsrc_freecache(struct nfsrvcache *rp)
 	if (rp->rc_flag & RC_REPMBUF) {
 		mbuf_freem(rp->rc_reply);
 		if (!(rp->rc_flag & RC_UDP))
-			nfsrc_tcpsavedreplies--;
+			atomic_add_int(&nfsrc_tcpsavedreplies, -1);
 	}
 	FREE((caddr_t)rp, M_NFSRVCACHE);
 	newnfsstats.srvcache_size--;
@@ -757,20 +803,22 @@ nfsrvd_cleancache(void)
 	struct nfsrvcache *rp, *nextrp;
 	int i;
 
-	NFSLOCKCACHE();
 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
+		mtx_lock(&nfsrc_tcpmtx[i]);
 		LIST_FOREACH_SAFE(rp, &nfsrvhashtbl[i], rc_hash, nextrp) {
 			nfsrc_freecache(rp);
 		}
+		mtx_unlock(&nfsrc_tcpmtx[i]);
 	}
+	mtx_lock(&nfsrc_udpmtx);
 	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
 		LIST_FOREACH_SAFE(rp, &nfsrvudphashtbl[i], rc_hash, nextrp) {
 			nfsrc_freecache(rp);
 		}
 	}
 	newnfsstats.srvcache_size = 0;
+	mtx_unlock(&nfsrc_udpmtx);
 	nfsrc_tcpsavedreplies = 0;
-	NFSUNLOCKCACHE();
 }
 
 /*
@@ -780,28 +828,97 @@ static void
 nfsrc_trimcache(u_int64_t sockref, struct socket *so)
 {
 	struct nfsrvcache *rp, *nextrp;
-	int i;
+	int i, j, k, time_histo[10];
+	time_t thisstamp;
+	static time_t udp_lasttrim = 0, tcp_lasttrim = 0;
+	static int onethread = 0;
 
-	NFSLOCKCACHE();
-	TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
-		if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
-		     && rp->rc_refcnt == 0
-		     && ((rp->rc_flag & RC_REFCNT) ||
-			 NFSD_MONOSEC > rp->rc_timestamp ||
-			 nfsrc_udpcachesize > nfsrc_udphighwater))
-			nfsrc_freecache(rp);
-	}
-	for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
-		LIST_FOREACH_SAFE(rp, &nfsrvhashtbl[i], rc_hash, nextrp) {
+	if (atomic_cmpset_acq_int(&onethread, 0, 1) == 0)
+		return;
+	if (NFSD_MONOSEC != udp_lasttrim ||
+	    nfsrc_udpcachesize >= (nfsrc_udphighwater +
+	    nfsrc_udphighwater / 2)) {
+		mtx_lock(&nfsrc_udpmtx);
+		udp_lasttrim = NFSD_MONOSEC;
+		TAILQ_FOREACH_SAFE(rp, &nfsrvudplru, rc_lru, nextrp) {
 			if (!(rp->rc_flag & (RC_INPROG|RC_LOCKED|RC_WANTED))
 			     && rp->rc_refcnt == 0
 			     && ((rp->rc_flag & RC_REFCNT) ||
-				 NFSD_MONOSEC > rp->rc_timestamp ||
-				 nfsrc_activesocket(rp, sockref, so)))
+				 udp_lasttrim > rp->rc_timestamp ||
+				 nfsrc_udpcachesize > nfsrc_udphighwater))
 				nfsrc_freecache(rp);
 		}
+		mtx_unlock(&nfsrc_udpmtx);
+	}
+	if (NFSD_MONOSEC != tcp_lasttrim ||
+	    nfsrc_tcpsavedreplies >= nfsrc_tcphighwater) {
+		for (i = 0; i < 10; i++)
+			time_histo[i] = 0;
+		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
+			mtx_lock(&nfsrc_tcpmtx[i]);
+			if (i == 0)
+				tcp_lasttrim = NFSD_MONOSEC;
+			LIST_FOREACH_SAFE(rp, &nfsrvhashtbl[i], rc_hash,
+			    nextrp) {
+				if (!(rp->rc_flag &
+				     (RC_INPROG|RC_LOCKED|RC_WANTED))
+				     && rp->rc_refcnt == 0) {
+					/*
+					 * The timestamps range from roughly the
+					 * present (tcp_lasttrim) to the present
+					 * + nfsrc_tcptimeout. Generate a simple
+					 * histogram of where the timeouts fall.
+					 */
+					j = rp->rc_timestamp - tcp_lasttrim;
+					if (j >= nfsrc_tcptimeout)
+						j = nfsrc_tcptimeout - 1;
+					if (j < 0)
+						j = 0;
+					j = (j * 10 / nfsrc_tcptimeout) % 10;
+					time_histo[j]++;
+					if ((rp->rc_flag & RC_REFCNT) ||
+					    tcp_lasttrim > rp->rc_timestamp ||
+					    nfsrc_activesocket(rp, sockref, so))
+						nfsrc_freecache(rp);
+				}
+			}
+			mtx_unlock(&nfsrc_tcpmtx[i]);
+		}
+		j = nfsrc_tcphighwater / 5;	/* 20% of it */
+		if (j > 0 && (nfsrc_tcpsavedreplies + j) > nfsrc_tcphighwater) {
+			/*
+			 * Trim some more with a smaller timeout of as little
+			 * as 20% of nfsrc_tcptimeout to try and get below
+			 * 80% of the nfsrc_tcphighwater.
+			 */
+			k = 0;
+			for (i = 0; i < 8; i++) {
+				k += time_histo[i];
+				if (k > j)
+					break;
+			}
+			k = nfsrc_tcptimeout * (i + 1) / 10;
+			if (k < 1)
+				k = 1;
+			thisstamp = tcp_lasttrim + k;
+			for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
+				mtx_lock(&nfsrc_tcpmtx[i]);
+				LIST_FOREACH_SAFE(rp, &nfsrvhashtbl[i], rc_hash,
+				    nextrp) {
+					if (!(rp->rc_flag &
+					     (RC_INPROG|RC_LOCKED|RC_WANTED))
+					     && rp->rc_refcnt == 0
+					     && ((rp->rc_flag & RC_REFCNT) ||
+						 thisstamp > rp->rc_timestamp ||
+						 nfsrc_activesocket(rp, sockref,
+						    so)))
+						nfsrc_freecache(rp);
+				}
+				mtx_unlock(&nfsrc_tcpmtx[i]);
+			}
+		}
 	}
-	NFSUNLOCKCACHE();
+	atomic_store_rel_int(&onethread, 0);
 }
 
 /*
@@ -810,12 +927,14 @@ nfsrc_trimcache(u_int64_t sockref, struc
 APPLESTATIC void
 nfsrvd_refcache(struct nfsrvcache *rp)
 {
+	struct mtx *mutex;
 
-	NFSLOCKCACHE();
+	mutex = nfsrc_cachemutex(rp);
+	mtx_lock(mutex);
 	if (rp->rc_refcnt < 0)
 		panic("nfs cache refcnt");
 	rp->rc_refcnt++;
-	NFSUNLOCKCACHE();
+	mtx_unlock(mutex);
 }
 
 /*
@@ -824,14 +943,16 @@ nfsrvd_refcache(struct nfsrvcache *rp)
 APPLESTATIC void
 nfsrvd_derefcache(struct nfsrvcache *rp)
 {
+	struct mtx *mutex;
 
-	NFSLOCKCACHE();
+	mutex = nfsrc_cachemutex(rp);
+	mtx_lock(mutex);
 	if (rp->rc_refcnt <= 0)
 		panic("nfs cache derefcnt");
 	rp->rc_refcnt--;
 	if (rp->rc_refcnt == 0 && !(rp->rc_flag & (RC_LOCKED | RC_INPROG)))
 		nfsrc_freecache(rp);
-	NFSUNLOCKCACHE();
+	mtx_unlock(mutex);
 }
 
 /*
--- fs/nfsserver/nfs_nfsdport.c.orig	2013-03-02 18:19:34.000000000 -0500
+++ fs/nfsserver/nfs_nfsdport.c	2013-03-12 17:51:31.000000000 -0400
@@ -61,7 +61,8 @@ extern struct nfsv4lock nfsd_suspend_loc
 extern struct nfssessionhash nfssessionhash[NFSSESSIONHASHSIZE];
 struct vfsoptlist nfsv4root_opt, nfsv4root_newopt;
 NFSDLOCKMUTEX;
-struct mtx nfs_cache_mutex;
+struct mtx nfsrc_tcpmtx[NFSRVCACHE_HASHSIZE];
+struct mtx nfsrc_udpmtx;
 struct mtx nfs_v4root_mutex;
 struct nfsrvfh nfs_rootfh, nfs_pubfh;
 int nfs_pubfhset = 0, nfs_rootfhset = 0;
@@ -3305,7 +3306,10 @@ nfsd_modevent(module_t mod, int type, vo
 		if (loaded)
 			goto out;
 		newnfs_portinit();
-		mtx_init(&nfs_cache_mutex, "nfs_cache_mutex", NULL, MTX_DEF);
+		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++)
+			mtx_init(&nfsrc_tcpmtx[i], "nfs_tcpcache_mutex", NULL,
+			    MTX_DEF);
+		mtx_init(&nfsrc_udpmtx, "nfs_udpcache_mutex", NULL, MTX_DEF);
 		mtx_init(&nfs_v4root_mutex, "nfs_v4root_mutex", NULL, MTX_DEF);
 		mtx_init(&nfsv4root_mnt.mnt_mtx, "struct mount mtx", NULL,
 		    MTX_DEF);
@@ -3352,7 +3356,9 @@ nfsd_modevent(module_t mod, int type, vo
 			svcpool_destroy(nfsrvd_pool);
 
 		/* and get rid of the locks */
-		mtx_destroy(&nfs_cache_mutex);
+		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++)
+			mtx_destroy(&nfsrc_tcpmtx[i]);
+		mtx_destroy(&nfsrc_udpmtx);
 		mtx_destroy(&nfs_v4root_mutex);
 		mtx_destroy(&nfsv4root_mnt.mnt_mtx);
 		for (i = 0; i < NFSSESSIONHASHSIZE; i++)
--- fs/nfs/nfsport.h.orig	2013-03-02 18:35:13.000000000 -0500
+++ fs/nfs/nfsport.h	2013-03-12 17:51:31.000000000 -0400
@@ -609,11 +609,6 @@ void nfsrvd_rcv(struct socket *, void *,
 #define	NFSREQSPINLOCK		extern struct mtx nfs_req_mutex
 #define	NFSLOCKREQ()		mtx_lock(&nfs_req_mutex)
 #define	NFSUNLOCKREQ()		mtx_unlock(&nfs_req_mutex)
-#define	NFSCACHEMUTEX		extern struct mtx nfs_cache_mutex
-#define	NFSCACHEMUTEXPTR	(&nfs_cache_mutex)
-#define	NFSLOCKCACHE()		mtx_lock(&nfs_cache_mutex)
-#define	NFSUNLOCKCACHE()	mtx_unlock(&nfs_cache_mutex)
-#define	NFSCACHELOCKREQUIRED()	mtx_assert(&nfs_cache_mutex, MA_OWNED)
 #define	NFSSOCKMUTEX		extern struct mtx nfs_slock_mutex
 #define	NFSSOCKMUTEXPTR		(&nfs_slock_mutex)
 #define	NFSLOCKSOCK()		mtx_lock(&nfs_slock_mutex)
--- fs/nfs/nfsrvcache.h.orig	2013-01-07 09:04:15.000000000 -0500
+++ fs/nfs/nfsrvcache.h	2013-03-12 18:02:42.000000000 -0400
@@ -41,7 +41,7 @@
 #define	NFSRVCACHE_MAX_SIZE	2048
 #define	NFSRVCACHE_MIN_SIZE	  64
 
-#define	NFSRVCACHE_HASHSIZE	20
+#define	NFSRVCACHE_HASHSIZE	500
 
 struct nfsrvcache {
 	LIST_ENTRY(nfsrvcache) rc_hash;		/* Hash chain */

Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?312742115.4078123.1363729759052.JavaMail.root>