Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 27 Nov 2012 21:19:59 +0000 (UTC)
From:      Andre Oppermann <andre@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r243631 - in head/sys: kern sys
Message-ID:  <201211272119.qARLJxXV061083@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: andre
Date: Tue Nov 27 21:19:58 2012
New Revision: 243631
URL: http://svnweb.freebsd.org/changeset/base/243631

Log:
  Base the mbuf related limits on the available physical memory or
  kernel memory, whichever is lower.  The overall mbuf related memory
  limit must be set so that mbufs (and clusters of various sizes)
  can't exhaust physical RAM or KVM.
  
  The limit is set to half of the physical RAM or KVM (whichever is
  lower) as the baseline.  In any normal scenario we want to leave
  at least half of the physmem/kvm for other kernel functions and
  userspace to prevent it from swapping too easily.  Via a tunable
  kern.maxmbufmem the limit can be upped to at most 3/4 of physmem/kvm.
  
  At the same time divorce maxfiles from maxusers and set maxfiles to
  physpages / 8 with a floor based on maxusers.  This way busy servers
  can make use of the significantly increased mbuf limits with a much
  larger number of open sockets.
  
  Tidy up ordering in init_param2() and check up on some users of
  those values calculated here.
  
  Out of the overall mbuf memory limit 2K clusters and 4K (page size)
  clusters get 1/4 each because these are the most heavily used mbuf
  sizes.  2K clusters are used for MTU 1500 ethernet inbound packets.
  4K clusters are used whenever possible for sends on sockets and thus
  outbound packets.  The larger cluster sizes of 9K and 16K are each limited
  to 1/6 of the overall mbuf memory limit.  When jumbo MTUs are used
  these large clusters will end up only on the inbound path.  They are
  not used on outbound, there it's still 4K.  Yes, that will stay that
  way because otherwise we run into lots of complications in the
  stack.  And it really isn't a problem, so don't make a scene.
  
  Normal mbufs (256B) weren't limited at all previously.  This was
  problematic as there are certain places in the kernel that on
  allocation failure of clusters try to piece together their packet
  from smaller mbufs.
  
  The mbuf limit is the number of all other mbuf sizes together plus
  some more to allow for standalone mbufs (ACK for example) and to
  send off a copy of a cluster.  Unfortunately there isn't a way to
  set an overall limit for all mbuf memory together as UMA doesn't
  support such a limit.
  
  NB: Every cluster also has an mbuf associated with it.
  
  Two examples of the revised mbuf sizing limits:
  
  1GB KVM:
   512MB limit for mbufs
   419,430 mbufs
    65,536 2K mbuf clusters
    32,768 4K mbuf clusters
     9,709 9K mbuf clusters
     5,461 16K mbuf clusters
  
  16GB RAM:
   8GB limit for mbufs
   33,554,432 mbufs
    1,048,576 2K mbuf clusters
      524,288 4K mbuf clusters
      155,344 9K mbuf clusters
       87,381 16K mbuf clusters
  
  These defaults should be sufficient for even the most demanding
  network loads.
  
  MFC after:	1 month

Modified:
  head/sys/kern/kern_mbuf.c
  head/sys/kern/subr_param.c
  head/sys/kern/uipc_socket.c
  head/sys/sys/eventhandler.h
  head/sys/sys/mbuf.h

Modified: head/sys/kern/kern_mbuf.c
==============================================================================
--- head/sys/kern/kern_mbuf.c	Tue Nov 27 20:22:36 2012	(r243630)
+++ head/sys/kern/kern_mbuf.c	Tue Nov 27 21:19:58 2012	(r243631)
@@ -96,6 +96,7 @@ __FBSDID("$FreeBSD$");
  *
  */
 
+int nmbufs;			/* limits number of mbufs */
 int nmbclusters;		/* limits number of mbuf clusters */
 int nmbjumbop;			/* limits number of page size jumbo clusters */
 int nmbjumbo9;			/* limits number of 9k jumbo clusters */
@@ -147,9 +148,11 @@ sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
 	newnmbclusters = nmbclusters;
 	error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); 
 	if (error == 0 && req->newptr) {
-		if (newnmbclusters > nmbclusters) {
+		if (newnmbclusters > nmbclusters &&
+		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbclusters = newnmbclusters;
 			uma_zone_set_max(zone_clust, nmbclusters);
+			nmbclusters = uma_zone_get_max(zone_clust);
 			EVENTHANDLER_INVOKE(nmbclusters_change);
 		} else
 			error = EINVAL;
@@ -168,9 +171,11 @@ sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
 	newnmbjumbop = nmbjumbop;
 	error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); 
 	if (error == 0 && req->newptr) {
-		if (newnmbjumbop> nmbjumbop) {
+		if (newnmbjumbop > nmbjumbop &&
+		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbjumbop = newnmbjumbop;
 			uma_zone_set_max(zone_jumbop, nmbjumbop);
+			nmbjumbop = uma_zone_get_max(zone_jumbop);
 		} else
 			error = EINVAL;
 	}
@@ -189,9 +194,11 @@ sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
 	newnmbjumbo9 = nmbjumbo9;
 	error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); 
 	if (error == 0 && req->newptr) {
-		if (newnmbjumbo9> nmbjumbo9) {
+		if (newnmbjumbo9 > nmbjumbo9&&
+		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbjumbo9 = newnmbjumbo9;
 			uma_zone_set_max(zone_jumbo9, nmbjumbo9);
+			nmbjumbo9 = uma_zone_get_max(zone_jumbo9);
 		} else
 			error = EINVAL;
 	}
@@ -209,9 +216,11 @@ sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
 	newnmbjumbo16 = nmbjumbo16;
 	error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); 
 	if (error == 0 && req->newptr) {
-		if (newnmbjumbo16> nmbjumbo16) {
+		if (newnmbjumbo16 > nmbjumbo16 &&
+		    nmbufs >= nmbclusters + nmbjumbop + nmbjumbo9 + nmbjumbo16) {
 			nmbjumbo16 = newnmbjumbo16;
 			uma_zone_set_max(zone_jumbo16, nmbjumbo16);
+			nmbjumbo16 = uma_zone_get_max(zone_jumbo16);
 		} else
 			error = EINVAL;
 	}
@@ -221,6 +230,27 @@ SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumb
 &nmbjumbo16, 0, sysctl_nmbjumbo16, "IU",
     "Maximum number of mbuf 16k jumbo clusters allowed");
 
+static int	/* returns 0 on success, an errno value otherwise */
+sysctl_nmbufs(SYSCTL_HANDLER_ARGS)	/* sysctl handler for the nmbufs limit */
+{
+	int error, newnmbufs;
+
+	newnmbufs = nmbufs;	/* report the current limit; may be overwritten below */
+	error = sysctl_handle_int(oidp, &newnmbufs, 0, req);
+	if (error == 0 && req->newptr) {	/* only act when a new value was supplied */
+		if (newnmbufs > nmbufs) {	/* the limit may only be raised */
+			nmbufs = newnmbufs;
+			uma_zone_set_max(zone_mbuf, nmbufs);
+			nmbufs = uma_zone_get_max(zone_mbuf);	/* read back into nmbufs, not nmbclusters */
+			EVENTHANDLER_INVOKE(nmbufs_change);
+		} else
+			error = EINVAL;	/* equal or lower values are rejected */
+	}
+	return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbuf, CTLTYPE_INT|CTLFLAG_RW,
+&nmbufs, 0, sysctl_nmbufs, "IU",
+    "Maximum number of mbufs allowed");
 
 
 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
@@ -275,6 +305,10 @@ mbuf_init(void *dummy)
 	    NULL, NULL,
 #endif
 	    MSIZE - 1, UMA_ZONE_MAXBUCKET);
+	if (nmbufs > 0) {
+		uma_zone_set_max(zone_mbuf, nmbufs);
+		nmbufs = uma_zone_get_max(zone_mbuf);
+	}
 
 	zone_clust = uma_zcreate(MBUF_CLUSTER_MEM_NAME, MCLBYTES,
 	    mb_ctor_clust, mb_dtor_clust,
@@ -284,8 +318,10 @@ mbuf_init(void *dummy)
 	    NULL, NULL,
 #endif
 	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
-	if (nmbclusters > 0)
+	if (nmbclusters > 0) {
 		uma_zone_set_max(zone_clust, nmbclusters);
+		nmbclusters = uma_zone_get_max(zone_clust);
+	}
 
 	zone_pack = uma_zsecond_create(MBUF_PACKET_MEM_NAME, mb_ctor_pack,
 	    mb_dtor_pack, mb_zinit_pack, mb_zfini_pack, zone_mbuf);
@@ -299,8 +335,10 @@ mbuf_init(void *dummy)
 	    NULL, NULL,
 #endif
 	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
-	if (nmbjumbop > 0)
+	if (nmbjumbop > 0) {
 		uma_zone_set_max(zone_jumbop, nmbjumbop);
+		nmbjumbop = uma_zone_get_max(zone_jumbop);
+	}
 
 	zone_jumbo9 = uma_zcreate(MBUF_JUMBO9_MEM_NAME, MJUM9BYTES,
 	    mb_ctor_clust, mb_dtor_clust,
@@ -310,9 +348,11 @@ mbuf_init(void *dummy)
 	    NULL, NULL,
 #endif
 	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
-	if (nmbjumbo9 > 0)
-		uma_zone_set_max(zone_jumbo9, nmbjumbo9);
 	uma_zone_set_allocf(zone_jumbo9, mbuf_jumbo_alloc);
+	if (nmbjumbo9 > 0) {
+		uma_zone_set_max(zone_jumbo9, nmbjumbo9);
+		nmbjumbo9 = uma_zone_get_max(zone_jumbo9);
+	}
 
 	zone_jumbo16 = uma_zcreate(MBUF_JUMBO16_MEM_NAME, MJUM16BYTES,
 	    mb_ctor_clust, mb_dtor_clust,
@@ -322,9 +362,11 @@ mbuf_init(void *dummy)
 	    NULL, NULL,
 #endif
 	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
-	if (nmbjumbo16 > 0)
-		uma_zone_set_max(zone_jumbo16, nmbjumbo16);
 	uma_zone_set_allocf(zone_jumbo16, mbuf_jumbo_alloc);
+	if (nmbjumbo16 > 0) {
+		uma_zone_set_max(zone_jumbo16, nmbjumbo16);
+		nmbjumbo16 = uma_zone_get_max(zone_jumbo16);
+	}
 
 	zone_ext_refcnt = uma_zcreate(MBUF_EXTREFCNT_MEM_NAME, sizeof(u_int),
 	    NULL, NULL,

Modified: head/sys/kern/subr_param.c
==============================================================================
--- head/sys/kern/subr_param.c	Tue Nov 27 20:22:36 2012	(r243630)
+++ head/sys/kern/subr_param.c	Tue Nov 27 21:19:58 2012	(r243631)
@@ -93,6 +93,7 @@ int	ncallout;			/* maximum # of timer ev
 int	nbuf;
 int	ngroups_max;			/* max # groups per process */
 int	nswbuf;
+long	maxmbufmem;			/* max mbuf memory */
 pid_t	pid_max = PID_MAX;
 long	maxswzone;			/* max swmeta KVA storage */
 long	maxbcache;			/* max buffer cache KVA storage */
@@ -270,6 +271,7 @@ init_param1(void)
 void
 init_param2(long physpages)
 {
+	long realmem;
 
 	/* Base parameters */
 	maxusers = MAXUSERS;
@@ -293,19 +295,25 @@ init_param2(long physpages)
 	/*
 	 * The following can be overridden after boot via sysctl.  Note:
 	 * unless overriden, these macros are ultimately based on maxusers.
-	 */
-	maxproc = NPROC;
-	TUNABLE_INT_FETCH("kern.maxproc", &maxproc);
-	/*
 	 * Limit maxproc so that kmap entries cannot be exhausted by
 	 * processes.
 	 */
+	maxproc = NPROC;
+	TUNABLE_INT_FETCH("kern.maxproc", &maxproc);
 	if (maxproc > (physpages / 12))
 		maxproc = physpages / 12;
-	maxfiles = MAXFILES;
-	TUNABLE_INT_FETCH("kern.maxfiles", &maxfiles);
 	maxprocperuid = (maxproc * 9) / 10;
-	maxfilesperproc = (maxfiles * 9) / 10;
+
+	/*
+	 * The default limit for maxfiles is 1/8 of the number of
+	 * physical pages but not less than MAXFILES (based on maxusers).
+	 * At most it can be 1/4 the number of physical pages.
+	 */
+	maxfiles = imax(MAXFILES, physpages / 8);
+	TUNABLE_INT_FETCH("kern.maxfiles", &maxfiles);
+	if (maxfiles > (physpages / 4))
+		maxfiles = physpages / 4;
+	maxfilesperproc = (maxfiles / 10) * 9;
 	
 	/*
 	 * Cannot be changed after boot.
@@ -313,20 +321,35 @@ init_param2(long physpages)
 	nbuf = NBUF;
 	TUNABLE_INT_FETCH("kern.nbuf", &nbuf);
 
+	/*
+	 * XXX: Does the callout wheel have to be so big?
+	 */
 	ncallout = 16 + maxproc + maxfiles;
 	TUNABLE_INT_FETCH("kern.ncallout", &ncallout);
 
 	/*
+	 * The default limit for all mbuf related memory is 1/2 of all
+	 * available kernel memory (physical or kmem).
+	 * At most it can be 3/4 of available kernel memory.
+	 */
+	realmem = lmin(physpages * PAGE_SIZE,
+			VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS);
+	maxmbufmem = realmem / 2;
+	TUNABLE_LONG_FETCH("kern.maxmbufmem", &maxmbufmem);
+	if (maxmbufmem > (realmem / 4) * 3)
+		maxmbufmem = (realmem / 4) * 3;
+
+	/*
 	 * The default for maxpipekva is min(1/64 of the kernel address space,
 	 * max(1/64 of main memory, 512KB)).  See sys_pipe.c for more details.
 	 */
 	maxpipekva = (physpages / 64) * PAGE_SIZE;
+	TUNABLE_LONG_FETCH("kern.ipc.maxpipekva", &maxpipekva);
 	if (maxpipekva < 512 * 1024)
 		maxpipekva = 512 * 1024;
 	if (maxpipekva > (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 64)
 		maxpipekva = (VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) /
 		    64;
-	TUNABLE_LONG_FETCH("kern.ipc.maxpipekva", &maxpipekva);
 }
 
 /*

Modified: head/sys/kern/uipc_socket.c
==============================================================================
--- head/sys/kern/uipc_socket.c	Tue Nov 27 20:22:36 2012	(r243630)
+++ head/sys/kern/uipc_socket.c	Tue Nov 27 21:19:58 2012	(r243631)
@@ -290,7 +290,7 @@ init_maxsockets(void *ignored)
 {
 
 	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
-	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
+	maxsockets = imax(maxsockets, maxfiles);
 }
 SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
 
@@ -306,12 +306,9 @@ sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
 	newmaxsockets = maxsockets;
 	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
 	if (error == 0 && req->newptr) {
-		if (newmaxsockets > maxsockets) {
+		if (newmaxsockets > maxsockets &&
+		    newmaxsockets <= maxfiles) {
 			maxsockets = newmaxsockets;
-			if (maxsockets > ((maxfiles / 4) * 3)) {
-				maxfiles = (maxsockets * 5) / 4;
-				maxfilesperproc = (maxfiles * 9) / 10;
-			}
 			EVENTHANDLER_INVOKE(maxsockets_change);
 		} else
 			error = EINVAL;

Modified: head/sys/sys/eventhandler.h
==============================================================================
--- head/sys/sys/eventhandler.h	Tue Nov 27 20:22:36 2012	(r243630)
+++ head/sys/sys/eventhandler.h	Tue Nov 27 21:19:58 2012	(r243631)
@@ -253,6 +253,7 @@ EVENTHANDLER_DECLARE(thread_fini, thread
 
 typedef void (*uma_zone_chfn)(void *);
 EVENTHANDLER_DECLARE(nmbclusters_change, uma_zone_chfn);
+EVENTHANDLER_DECLARE(nmbufs_change, uma_zone_chfn);
 EVENTHANDLER_DECLARE(maxsockets_change, uma_zone_chfn);
 
 #endif /* SYS_EVENTHANDLER_H */

Modified: head/sys/sys/mbuf.h
==============================================================================
--- head/sys/sys/mbuf.h	Tue Nov 27 20:22:36 2012	(r243630)
+++ head/sys/sys/mbuf.h	Tue Nov 27 21:19:58 2012	(r243631)
@@ -395,7 +395,7 @@ struct mbstat {
  *
  * The rest of it is defined in kern/kern_mbuf.c
  */
-
+extern long		maxmbufmem;
 extern uma_zone_t	zone_mbuf;
 extern uma_zone_t	zone_clust;
 extern uma_zone_t	zone_pack;



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201211272119.qARLJxXV061083>