Date:      Fri, 9 Aug 2002 15:16:27 -0400
From:      Bosko Milekic <bmilekic@unixdaemons.com>
To:        gallatin@freebsd.org, ken@freebsd.org
Cc:        freebsd-net@freebsd.org
Subject:   Jumbo Clusters in mb_alloc
Message-ID:  <20020809151627.A88180@unixdaemons.com>

--X1bOJ3K7DJ5YkBrT
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline


Hi guys,

    Can you put this (the attached) to use?

    It's the implementation of jumbo clusters within mb_alloc, as
    discussed on -net not too long ago.

    I have not written the allocation interface yet, as I would first
    like to know whether you could put these to use in some of the
    drivers, and what you would need from the interface.
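
    Just to give a strawman: the kind of interface I have in mind
    might look something like the following.  None of this is written,
    and all of the names below are placeholders, so treat it purely as
    a discussion aid:

	/* Hypothetical jumbo cluster interface (sketch only). */
	caddr_t		 m_jumboalloc(int how);	/* M_TRYWAIT/M_DONTWAIT */
	void		 m_jumbofree(caddr_t buf);
	struct mbuf	*m_getjumbo(int how);	/* mbuf + jumbo buf attached */

	/* e.g., in a driver's receive-ring refill path: */
	struct mbuf *m;

	m = m_getjumbo(M_DONTWAIT);
	if (m == NULL)
		return (ENOBUFS);
	/*
	 * ... point the NIC's scatter/gather DMA at the MJUMBOBYTES
	 * region at m->m_ext.ext_buf ...
	 */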

    On a parallel note, I've been thinking about [not now, but
    eventually] moving the virtual address "sf_buf" allocations to
    mb_alloc.  I have an idea of how to do this easily, but I haven't
    pursued it yet because -CURRENT is still largely under Giant, so I
    cannot accurately measure the potential performance benefits.  I've
    been thinking the same about the uipc_jumbo stuff, because those
    allocations look a lot like sf_bufs.  The trick would be to set the
    map-starved flag (ml_mapfull) for their address space map in
    mb_alloc to 1; mb_alloc would then consider the map "starved" and
    never try to kmem_malloc() from it (in fact, it doesn't even have
    to be a "map"; it just has to be some sort of pre-allocated space,
    obtained, for example, with kmem_alloc_pageable()).  mb_alloc would
    then populate the per-CPU caches and the general cache with the
    PAGE_SIZE virtual address "buffers" (they have no ptes in pmap, so
    they're not really "buffers," as they have no physical memory
    associated with them at this point).  In any case, I think this
    would work, but I'm delaying it until a while after 5.0, when I can
    really measure things on SMP.
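
    In rough terms, the setup would look something like the following.
    Again, just a sketch of the idea; "mb_list_sfbuf" is a made-up
    list manager name for the sf_buf case:

	vm_offset_t base;

	/* Pre-allocated pageable VA space; no physical pages behind it. */
	base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
	mb_list_sfbuf.ml_mapbase = base;
	mb_list_sfbuf.ml_maptop = base + nsfbufs * PAGE_SIZE;
	mb_list_sfbuf.ml_mapfull = 1;	/* "starved": never kmem_malloc() */

	/*
	 * Then carve [ml_mapbase, ml_maptop) into PAGE_SIZE VA-only
	 * "buffers" and push them into the general and per-CPU caches,
	 * much like mb_pop_cont() does for mbufs and clusters today.
	 */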

    Anyway, see the attached patch and let me know if this is useful to
    you.  Thanks!

Cheers,
-- 
Bosko Milekic * bmilekic@unixdaemons.com * bmilekic@FreeBSD.org


--X1bOJ3K7DJ5YkBrT
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="jumbo.diff"

Index: src/sys/conf/NOTES
===================================================================
RCS file: /home/ncvs/src/sys/conf/NOTES,v
retrieving revision 1.1061
diff -u -r1.1061 NOTES
--- src/sys/conf/NOTES	9 Aug 2002 15:30:47 -0000	1.1061
+++ src/sys/conf/NOTES	9 Aug 2002 19:05:39 -0000
@@ -2252,6 +2252,7 @@
 options 	NBUF=512	# Number of buffer headers
 
 options 	NMBCLUSTERS=1024	# Number of mbuf clusters
+options 	NMBJUMBOBUFS=128	# Number of jumbo clusters
 
 options 	SCSI_NCR_DEBUG
 options 	SCSI_NCR_MAX_SYNC=10000
Index: src/sys/conf/options
===================================================================
RCS file: /home/ncvs/src/sys/conf/options,v
retrieving revision 1.341
diff -u -r1.341 options
--- src/sys/conf/options	3 Aug 2002 00:19:58 -0000	1.341
+++ src/sys/conf/options	9 Aug 2002 19:05:45 -0000
@@ -197,6 +197,7 @@
 MAXFILES	opt_param.h
 NBUF		opt_param.h
 NMBCLUSTERS	opt_param.h
+NMBJUMBOBUFS	opt_param.h
 NSFBUFS		opt_param.h
 VM_BCACHE_SIZE_MAX	opt_param.h
 VM_SWZONE_SIZE_MAX	opt_param.h
Index: src/sys/kern/kern_malloc.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_malloc.c,v
retrieving revision 1.111
diff -u -r1.111 kern_malloc.c
--- src/sys/kern/kern_malloc.c	31 May 2002 09:41:09 -0000	1.111
+++ src/sys/kern/kern_malloc.c	9 Aug 2002 19:05:52 -0000
@@ -335,6 +335,7 @@
 	u_long mem_size;
 	void *hashmem;
 	u_long hashsize;
+	u_int mb_size;
 	int highbit;
 	int bits;
 	int i;
@@ -385,9 +386,12 @@
 	 * amount to slightly more address space than we need for the submaps,
 	 * but it never hurts to have an extra page in kmem_map.
 	 */
-	npg = (nmbufs * MSIZE + nmbclusters * MCLBYTES + nmbcnt *
-	    sizeof(u_int) + vm_kmem_size) / PAGE_SIZE;
-
+	mb_size = nmbufs * MSIZE + nmbclusters * MCLBYTES + nmbcnt *
+	    sizeof(u_int);
+#ifdef NMBJUMBOBUFS
+	mb_size += nmbjumbobufs * MJUMBOSIZE;
+#endif
+	npg = (mb_size + vm_kmem_size) / PAGE_SIZE;
 	kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase,
 		(vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE));
 	kmem_map->system_map = 1;
Index: src/sys/kern/subr_mbuf.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/subr_mbuf.c,v
retrieving revision 1.29
diff -u -r1.29 subr_mbuf.c
--- src/sys/kern/subr_mbuf.c	8 Aug 2002 13:31:57 -0000	1.29
+++ src/sys/kern/subr_mbuf.c	9 Aug 2002 19:05:59 -0000
@@ -151,6 +151,9 @@
 int	nmbclusters;
 int	nmbcnt;
 int	nsfbufs;
+#ifdef NMBJUMBOBUFS
+int	nmbjumbobufs;
+#endif
 
 /*
  * Perform sanity checks of tunables declared above.
@@ -170,6 +173,10 @@
 	TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
 	nmbcnt = NMBCNTS;
 	TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt);
+#ifdef NMBJUMBOBUFS
+	nmbjumbobufs = NMBJUMBOBUFS;
+	TUNABLE_INT_FETCH("kern.ipc.nmbjumbobufs", &nmbjumbobufs);
+#endif
 	/* Sanity checks */
 	if (nmbufs < nmbclusters * 2)
 		nmbufs = nmbclusters * 2;
@@ -197,11 +204,15 @@
 	vm_offset_t	ml_maptop;
 	int		ml_mapfull;
 	u_int		ml_objsize;
+	u_int		ml_bucksize;
 	u_int		*ml_wmhigh;
 };
 static struct mb_lstmngr mb_list_mbuf, mb_list_clust;
 static struct mtx mbuf_gen, mbuf_pcpu[NCPU];
 u_int *cl_refcntmap;
+#ifdef NMBJUMBOBUFS
+static struct mb_lstmngr mb_list_jumbo;
+#endif
 
 /*
  * Local macros for internal allocator structure manipulations.
@@ -221,8 +232,8 @@
 #define	MB_GET_PCPU_LIST_NUM(mb_lst, num)				\
     (mb_lst)->ml_cntlst[(num)]
 
-#define	MB_BUCKET_INDX(mb_obj, mb_lst)					\
-    (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / PAGE_SIZE)
+#define	MB_BUCKET_INDX(mb_obj, mb_lst, mb_div)				\
+    (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / (mb_div))
 
 #define	MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst)				\
 {									\
@@ -271,6 +282,9 @@
 
 static u_int mbuf_limit = 512;	/* Upper limit on # of mbufs per CPU. */
 static u_int clust_limit = 128;	/* Upper limit on # of clusters per CPU. */
+#ifdef NMBJUMBOBUFS
+static u_int jumbo_limit = 32;	/* Upper limit on # of jumboclusts per CPU. */
+#endif
 
 /*
  * Objects exported by sysctl(8).
@@ -294,6 +308,12 @@
     "Mbuf general information and statistics");
 SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu,
     sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics");
+#ifdef NMBJUMBOBUFS
+SYSCTL_INT(_kern_ipc, OID_AUTO, nmbjumbobufs, CTLFLAG_RD, &nmbjumbobufs, 0,
+    "Maximum number of jumbo clusters available");
+SYSCTL_UINT(_kern_ipc, OID_AUTO, jumbo_limit, CTLFLAG_RW, &jumbo_limit, 0,
+    "Upper limit on number of jumbo clusters allowed on each PCPU list");
+#endif
 
 /*
  * Prototypes of local allocator routines.
@@ -311,6 +331,32 @@
  */
 #define	NMB_MBUF_INIT	4
 #define	NMB_CLUST_INIT	16
+#ifdef NMBJUMBOBUFS
+#define	NMB_JUMBO_INIT	1
+
+/*
+ * Do not change this unless you know EXACTLY what you're doing.  This is
+ * the pre-calculated number of pages of jumbo clusters to allocate per
+ * "bucket."  Here's how it works:
+ *
+ * - MJUMBOSIZE is a constant, and we picked it to be 9216 bytes.  This should
+ * be enough to accommodate large 9K frames, and a reference counter for them.
+ * - 'n' is the number of jumbo clusters per bucket.
+ *
+ * For minimum space wastage to occur, we need:
+ *  (MJUMBOSIZE * n) % PAGE_SIZE == 0.
+ * We want to pick the smallest possible 'n' so that our buckets don't span
+ * too much space.  For the smallest PAGE_SIZE of 4K (as on i386, for example),
+ * 'n' is 4, and this means that we will need:
+ * (MJUMBOSIZE * 4 / PAGE_SIZE) = JMB_PG_BUCKET = 9 pages per bucket.
+ * This corresponds to the smallest 'n' that we can find for a 4K page size.
+ * For a larger page size (e.g., 8K), we just have PAGE_SIZE = 2 * 4096,
+ * so n = 2 * 4 = 8.  We can calculate 'n' at runtime for our page size,
+ * as long as PAGE_SIZE is a power of 2, and as long as we define
+ * JMB_PG_BUCKET here appropriately.
+ */
+#define	JMB_PG_BUCKET	9
+#endif
 
 /*
  * Internal flags that allow for cache locks to remain "persistent" across
@@ -357,6 +403,7 @@
 	mb_list_mbuf.ml_mapfull = 0;
 	mb_list_mbuf.ml_objsize = MSIZE;
 	mb_list_mbuf.ml_wmhigh = &mbuf_limit;
+	mb_list_mbuf.ml_bucksize = PAGE_SIZE;
 
 	mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
 	mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
@@ -371,6 +418,7 @@
 	mb_list_clust.ml_mapfull = 0;
 	mb_list_clust.ml_objsize = MCLBYTES;
 	mb_list_clust.ml_wmhigh = &clust_limit;
+	mb_list_clust.ml_bucksize = PAGE_SIZE;
 
 	/*
 	 * Allocate required general (global) containers for each object type.
@@ -428,11 +476,45 @@
 	 */
 	mbstat.m_msize = MSIZE;
 	mbstat.m_mclbytes = MCLBYTES;
+	mbstat.m_mjumbobytes = MJUMBOBYTES;
 	mbstat.m_minclsize = MINCLSIZE;
 	mbstat.m_mlen = MLEN;
 	mbstat.m_mhlen = MHLEN;
 	mbstat.m_numtypes = MT_NTYPES;
 
+#ifdef NMBJUMBOBUFS
+	mb_map_size = (vm_size_t)(nmbjumbobufs * MJUMBOSIZE);
+	mb_map_size = roundup(mb_map_size, JMB_PG_BUCKET * PAGE_SIZE);
+	mb_list_jumbo.ml_btable = malloc((unsigned long)mb_map_size /
+	    (JMB_PG_BUCKET * PAGE_SIZE) * sizeof(struct mb_bucket *),
+	    M_MBUF, M_NOWAIT);
+	if (mb_list_jumbo.ml_btable == NULL)
+		goto bad;
+	mb_list_jumbo.ml_map = kmem_suballoc(kmem_map,
+	    &(mb_list_jumbo.ml_mapbase), &(mb_list_jumbo.ml_maptop),
+	    mb_map_size);
+	mb_list_jumbo.ml_map->system_map = 1;
+	mb_list_jumbo.ml_mapfull = 0;
+	mb_list_jumbo.ml_objsize = MJUMBOSIZE;
+	mb_list_jumbo.ml_wmhigh = &jumbo_limit;
+	mb_list_jumbo.ml_bucksize = JMB_PG_BUCKET * PAGE_SIZE;
+	mb_list_jumbo.ml_genlist = malloc(sizeof(struct mb_gen_list),
+	    M_MBUF, M_NOWAIT);
+	if (mb_list_jumbo.ml_genlist == NULL)
+		goto bad;
+	cv_init(&(mb_list_jumbo.ml_genlist->mgl_mstarved),
+	    "jumbo cluster pool starved");
+	mb_list_jumbo.ml_genlist->mb_cont.mc_lock = &mbuf_gen;
+	mb_list_jumbo.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER;
+	mb_list_jumbo.ml_genlist->mb_cont.mc_starved = 0;
+	mb_list_jumbo.ml_genlist->mb_cont.mc_objcount =
+	    &(mb_statpcpu[MB_GENLIST_OWNER].mb_jbfree);
+	mb_list_jumbo.ml_genlist->mb_cont.mc_numpgs =
+	    &(mb_statpcpu[MB_GENLIST_OWNER].mb_jbpgs);
+	mb_list_jumbo.ml_genlist->mb_cont.mc_types = NULL;
+	SLIST_INIT(&(mb_list_jumbo.ml_genlist->mb_cont.mc_bhead));
+#endif
+
 	/*
 	 * Allocate and initialize PCPU containers.
 	 */
@@ -492,6 +574,30 @@
 				goto bad;
 		}
 		MB_UNLOCK_CONT(pcpu_cnt);
+
+#ifdef NMBJUMBOBUFS
+		mb_list_jumbo.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
+		    M_MBUF, M_NOWAIT);
+		if (mb_list_jumbo.ml_cntlst[i] == NULL)
+			goto bad;
+		mb_list_jumbo.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i];
+		mb_list_jumbo.ml_cntlst[i]->mb_cont.mc_numowner = i;
+		mb_list_jumbo.ml_cntlst[i]->mb_cont.mc_starved = 0;
+		mb_list_jumbo.ml_cntlst[i]->mb_cont.mc_objcount =
+		    &(mb_statpcpu[i].mb_jbfree);
+		mb_list_jumbo.ml_cntlst[i]->mb_cont.mc_numpgs =
+		    &(mb_statpcpu[i].mb_jbpgs);
+		mb_list_jumbo.ml_cntlst[i]->mb_cont.mc_types = NULL;
+		SLIST_INIT(&(mb_list_jumbo.ml_cntlst[i]->mb_cont.mc_bhead));
+		pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_jumbo, i);
+		MB_LOCK_CONT(pcpu_cnt);
+		for (j = 0; j < NMB_JUMBO_INIT; j++) {
+			if (mb_pop_cont(&mb_list_jumbo, M_DONTWAIT, pcpu_cnt)
+			    == NULL)
+				goto bad;
+		}
+		MB_UNLOCK_CONT(pcpu_cnt);
+#endif
 	}
 
 	return;
@@ -527,12 +633,12 @@
 		return (NULL);
 
 	bucket = malloc(sizeof(struct mb_bucket) +
-	    PAGE_SIZE / mb_list->ml_objsize * sizeof(void *), M_MBUF,
+	    mb_list->ml_bucksize / mb_list->ml_objsize * sizeof(void *), M_MBUF,
 	    how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
 	if (bucket == NULL)
 		return (NULL);
 
-	p = (caddr_t)kmem_malloc(mb_list->ml_map, PAGE_SIZE,
+	p = (caddr_t)kmem_malloc(mb_list->ml_map, mb_list->ml_bucksize,
 	    how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
 	if (p == NULL) {
 		free(bucket, M_MBUF);
@@ -542,8 +648,9 @@
 	}
 
 	bucket->mb_numfree = 0;
-	mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket;
-	for (i = 0; i < (PAGE_SIZE / mb_list->ml_objsize); i++) {
+	mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list,
+	    mb_list->ml_bucksize)] = bucket;
+	for (i = 0; i < (mb_list->ml_bucksize / mb_list->ml_objsize); i++) {
 		bucket->mb_free[i] = p;
 		bucket->mb_numfree++;
 		p += mb_list->ml_objsize;
@@ -805,7 +912,8 @@
 	struct mb_bucket *bucket;
 	u_int owner;
 
-	bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)];
+	bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list,
+	    mb_list->ml_bucksize)];
 
 	/*
 	 * Make sure that if after we lock the bucket's present container the
@@ -957,9 +1065,9 @@
 			 * being freed in an effort to keep the mbtypes
 			 * counters approximately balanced across all lists.
 			 */ 
-			MB_MBTYPES_DEC(cnt_lst, type, (PAGE_SIZE /
+			MB_MBTYPES_DEC(cnt_lst, type, (mb_list->ml_bucksize /
 			    mb_list->ml_objsize) - bucket->mb_numfree);
-			MB_MBTYPES_INC(gen_list, type, (PAGE_SIZE /
+			MB_MBTYPES_INC(gen_list, type, (mb_list->ml_bucksize /
 			    mb_list->ml_objsize) - bucket->mb_numfree);
  
 			MB_UNLOCK_CONT(gen_list);
Index: src/sys/sys/mbuf.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/mbuf.h,v
retrieving revision 1.98
diff -u -r1.98 mbuf.h
--- src/sys/sys/mbuf.h	30 Jul 2002 22:03:57 -0000	1.98
+++ src/sys/sys/mbuf.h	9 Aug 2002 19:06:05 -0000
@@ -44,9 +44,9 @@
 #include <sys/mac.h>
 
 /*
- * Mbufs are of a single size, MSIZE (machine/param.h), which
+ * Mbufs are of a single size, MSIZE (sys/param.h), which
  * includes overhead.  An mbuf may add a single "mbuf cluster" of size
- * MCLBYTES (also in machine/param.h), which has no additional overhead
+ * MCLBYTES (also in sys/param.h), which has no additional overhead
  * and is used instead of the internal data area; this is done when
  * at least MINCLSIZE of data must be stored.  Additionally, it is possible
  * to allocate a separate buffer externally and attach it to the mbuf in
@@ -57,6 +57,16 @@
 #define	MINCLSIZE	(MHLEN + 1)	/* smallest amount to put in cluster */
 #define	M_MAXCOMPRESS	(MHLEN / 2)	/* max amount to copy for compression */
 
+/*
+ * Jumbo clusters/buffers are (9216 - sizeof(u_int)) bytes in size (the
+ * trailing u_int is used as a ref. count).  They are _virtually_
+ * contiguous data regions that can be attached to mbufs.  They are
+ * typically used for large (up to ~9K) frames with devices that can
+ * do scatter/gather.  MJUMBOBYTES is the size of the actual data region.
+ */
+#define	MJUMBOSIZE	9216
+#define	MJUMBOBYTES	(MJUMBOSIZE - sizeof(u_int))
+
 #ifdef _KERNEL
 /*-
  * Macros for type conversion:
@@ -225,6 +235,8 @@
 	u_long	mb_mbpgs;
 	u_long	mb_clfree;
 	u_long	mb_clpgs;
+	u_long	mb_jbfree;
+	u_long	mb_jbpgs;
 	long	mb_mbtypes[MT_NTYPES];
 	short	mb_active;
 };
@@ -245,6 +257,7 @@
 	u_long	m_mpfail;	/* XXX: times m_pullup failed */
 	u_long	m_msize;	/* length of an mbuf */
 	u_long	m_mclbytes;	/* length of an mbuf cluster */
+	u_long	m_mjumbobytes;	/* length of a jumbo cluster */
 	u_long	m_minclsize;	/* min length of data to allocate a cluster */
 	u_long	m_mlen;		/* length of data in an mbuf */
 	u_long	m_mhlen;	/* length of data in a header mbuf */
@@ -462,6 +475,7 @@
 extern	int nmbclusters;		/* Maximum number of clusters */
 extern	int nmbcnt;			/* Scale kmem_map for counter space */
 extern	int nmbufs;			/* Maximum number of mbufs */
+extern	int nmbjumbobufs;		/* Maximum number of jumbo clusters */
 extern	int nsfbufs;			/* Number of sendfile(2) bufs */
 
 void		 _mext_free(struct mbuf *);

--X1bOJ3K7DJ5YkBrT--
