Date: Fri, 9 Aug 2002 15:16:27 -0400
From: Bosko Milekic <bmilekic@unixdaemons.com>
To: gallatin@freebsd.org, ken@freebsd.org
Cc: freebsd-net@freebsd.org
Subject: Jumbo Clusters in mb_alloc
Message-ID: <20020809151627.A88180@unixdaemons.com>
[-- Attachment #1 --]
Hi guys,
Can you put this (the attached) to use?
It's the implementation of jumbo clusters within mb_alloc, as
discussed on -net not too long ago.
I have not written the allocation interface yet, because I would
first like to know whether you could put jumbo clusters to use in
some of the drivers, and what you would need from the interface.
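To give you an idea of the direction I'm leaning in, here's a rough,
untested sketch of what an entry point could look like. None of these
names exist yet -- m_getjumbo(), mb_alloc_jumbo(), mb_free_jumbo(),
and EXT_JUMBO are all placeholders until I know what you need:

/*
 * Rough, untested sketch only; the real interface will depend on
 * what the drivers need.  mb_alloc_jumbo()/mb_free_jumbo() would be
 * the mb_alloc entry points for the jumbo list, and EXT_JUMBO a new
 * external-storage type.
 */
struct mbuf *
m_getjumbo(int how)
{
        struct mbuf *m;
        caddr_t buf;

        m = m_gethdr(how, MT_DATA);
        if (m == NULL)
                return (NULL);
        /* Grab a jumbo cluster from mb_list_jumbo. */
        buf = mb_alloc_jumbo(how);
        if (buf == NULL) {
                m_freem(m);
                return (NULL);
        }
        /* Attach as external storage; trailing u_int is the refcount. */
        MEXTADD(m, buf, MJUMBOBYTES, mb_free_jumbo, NULL, 0, EXT_JUMBO);
        return (m);
}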
On a parallel note, I've been thinking about [not now, but
eventually] moving the virtual-address "sf_buf" allocations to
mb_alloc. I have an idea of how to do this easily, but I haven't
bothered yet because -CURRENT is still largely under Giant and I
cannot accurately measure the potential performance benefits. I've
been thinking the same thing about the uipc_jumbo stuff, because
those buffers look a lot like sf_bufs.

The trick would be to set map_starved to 1 for their address space
map in mb_alloc; mb_alloc would then think that the map is "starved"
and never try to kmem_malloc() from it. (In fact, it doesn't even
have to be a "map" -- it just has to be some sort of pre-allocated
space, obtained, for example, with kmem_alloc_pageable().) mb_alloc
would then populate the per-CPU caches and the general cache with
the PAGE_SIZE virtual-address "buffers." These have no ptes in the
pmap, so they're not really "buffers" at this point -- no physical
memory is associated with them yet.

I think this would work, but I'm delaying it until a while after
5.0, when I can really measure things on SMP.
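To make that concrete, here's a rough, untested sketch of the setup
I have in mind. Note that the per-list field is actually called
ml_mapfull in mb_alloc (what I called "map_starved" above), and
mb_sfbuf_va_init() is just a made-up name for illustration:

/*
 * Untested sketch of the "permanently starved" list idea for the
 * sf_buf-style VA allocations.  We hand mb_alloc a chunk of pageable
 * VA with no backing pages and mark the list full so that mb_alloc
 * never tries to kmem_malloc() from it.
 */
static struct mb_lstmngr mb_list_sfbuf;

static void
mb_sfbuf_va_init(int nbufs)
{
        vm_offset_t va;

        /* VA only: no ptes in pmap, no physical pages behind it yet. */
        va = kmem_alloc_pageable(kernel_map, nbufs * PAGE_SIZE);
        mb_list_sfbuf.ml_mapbase = va;
        mb_list_sfbuf.ml_maptop = va + nbufs * PAGE_SIZE;
        mb_list_sfbuf.ml_objsize = PAGE_SIZE;
        mb_list_sfbuf.ml_mapfull = 1;   /* never kmem_malloc() from this */

        /*
         * From here, populate the per-CPU caches and the general cache
         * with the PAGE_SIZE VA "buffers" as usual.
         */
}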
Anyway, see the attached patch and let me know if this is useful to
you. Thanks!
Cheers,
--
Bosko Milekic * bmilekic@unixdaemons.com * bmilekic@FreeBSD.org
[-- Attachment #2 --]
Index: src/sys/conf/NOTES
===================================================================
RCS file: /home/ncvs/src/sys/conf/NOTES,v
retrieving revision 1.1061
diff -u -r1.1061 NOTES
--- src/sys/conf/NOTES 9 Aug 2002 15:30:47 -0000 1.1061
+++ src/sys/conf/NOTES 9 Aug 2002 19:05:39 -0000
@@ -2252,6 +2252,7 @@
options NBUF=512 # Number of buffer headers
options NMBCLUSTERS=1024 # Number of mbuf clusters
+options NMBJUMBOBUFS=128 # Number of jumbo clusters
options SCSI_NCR_DEBUG
options SCSI_NCR_MAX_SYNC=10000
Index: src/sys/conf/options
===================================================================
RCS file: /home/ncvs/src/sys/conf/options,v
retrieving revision 1.341
diff -u -r1.341 options
--- src/sys/conf/options 3 Aug 2002 00:19:58 -0000 1.341
+++ src/sys/conf/options 9 Aug 2002 19:05:45 -0000
@@ -197,6 +197,7 @@
MAXFILES opt_param.h
NBUF opt_param.h
NMBCLUSTERS opt_param.h
+NMBJUMBOBUFS opt_param.h
NSFBUFS opt_param.h
VM_BCACHE_SIZE_MAX opt_param.h
VM_SWZONE_SIZE_MAX opt_param.h
Index: src/sys/kern/kern_malloc.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_malloc.c,v
retrieving revision 1.111
diff -u -r1.111 kern_malloc.c
--- src/sys/kern/kern_malloc.c 31 May 2002 09:41:09 -0000 1.111
+++ src/sys/kern/kern_malloc.c 9 Aug 2002 19:05:52 -0000
@@ -335,6 +335,7 @@
u_long mem_size;
void *hashmem;
u_long hashsize;
+ u_int mb_size;
int highbit;
int bits;
int i;
@@ -385,9 +386,12 @@
* amount to slightly more address space than we need for the submaps,
* but it never hurts to have an extra page in kmem_map.
*/
- npg = (nmbufs * MSIZE + nmbclusters * MCLBYTES + nmbcnt *
- sizeof(u_int) + vm_kmem_size) / PAGE_SIZE;
-
+ mb_size = nmbufs * MSIZE + nmbclusters * MCLBYTES + nmbcnt *
+ sizeof(u_int);
+#ifdef NMBJUMBOBUFS
+ mb_size += nmbjumbobufs * MJUMBOSIZE;
+#endif
+ npg = (mb_size + vm_kmem_size) / PAGE_SIZE;
kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase,
(vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE));
kmem_map->system_map = 1;
Index: src/sys/kern/subr_mbuf.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/subr_mbuf.c,v
retrieving revision 1.29
diff -u -r1.29 subr_mbuf.c
--- src/sys/kern/subr_mbuf.c 8 Aug 2002 13:31:57 -0000 1.29
+++ src/sys/kern/subr_mbuf.c 9 Aug 2002 19:05:59 -0000
@@ -151,6 +151,9 @@
int nmbclusters;
int nmbcnt;
int nsfbufs;
+#ifdef NMBJUMBOBUFS
+int nmbjumbobufs;
+#endif
/*
* Perform sanity checks of tunables declared above.
@@ -170,6 +173,10 @@
TUNABLE_INT_FETCH("kern.ipc.nsfbufs", &nsfbufs);
nmbcnt = NMBCNTS;
TUNABLE_INT_FETCH("kern.ipc.nmbcnt", &nmbcnt);
+#ifdef NMBJUMBOBUFS
+ nmbjumbobufs = NMBJUMBOBUFS;
+ TUNABLE_INT_FETCH("kern.ipc.nmbjumbobufs", &nmbjumbobufs);
+#endif
/* Sanity checks */
if (nmbufs < nmbclusters * 2)
nmbufs = nmbclusters * 2;
@@ -197,11 +204,15 @@
vm_offset_t ml_maptop;
int ml_mapfull;
u_int ml_objsize;
+ u_int ml_bucksize;
u_int *ml_wmhigh;
};
static struct mb_lstmngr mb_list_mbuf, mb_list_clust;
static struct mtx mbuf_gen, mbuf_pcpu[NCPU];
u_int *cl_refcntmap;
+#ifdef NMBJUMBOBUFS
+static struct mb_lstmngr mb_list_jumbo;
+#endif
/*
* Local macros for internal allocator structure manipulations.
@@ -221,8 +232,8 @@
#define MB_GET_PCPU_LIST_NUM(mb_lst, num) \
(mb_lst)->ml_cntlst[(num)]
-#define MB_BUCKET_INDX(mb_obj, mb_lst) \
- (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / PAGE_SIZE)
+#define MB_BUCKET_INDX(mb_obj, mb_lst, mb_div) \
+ (int)(((caddr_t)(mb_obj) - (caddr_t)(mb_lst)->ml_mapbase) / (mb_div))
#define MB_GET_OBJECT(mb_objp, mb_bckt, mb_lst) \
{ \
@@ -271,6 +282,9 @@
static u_int mbuf_limit = 512; /* Upper limit on # of mbufs per CPU. */
static u_int clust_limit = 128; /* Upper limit on # of clusters per CPU. */
+#ifdef NMBJUMBOBUFS
+static u_int jumbo_limit = 32; /* Upper limit on # of jumboclusts per CPU. */
+#endif
/*
* Objects exported by sysctl(8).
@@ -294,6 +308,12 @@
"Mbuf general information and statistics");
SYSCTL_OPAQUE(_kern_ipc, OID_AUTO, mb_statpcpu, CTLFLAG_RD, mb_statpcpu,
sizeof(mb_statpcpu), "S,", "Mbuf allocator per CPU statistics");
+#ifdef NMBJUMBOBUFS
+SYSCTL_INT(_kern_ipc, OID_AUTO, nmbjumbobufs, CTLFLAG_RD, &nmbjumbobufs, 0,
+ "Maximum number of jumbo clusters available");
+SYSCTL_UINT(_kern_ipc, OID_AUTO, jumbo_limit, CTLFLAG_RW, &jumbo_limit, 0,
+ "Upper limit on number of jumbo clusters allowed on each PCPU list");
+#endif
/*
* Prototypes of local allocator routines.
@@ -311,6 +331,32 @@
*/
#define NMB_MBUF_INIT 4
#define NMB_CLUST_INIT 16
+#ifdef NMBJUMBOBUFS
+#define NMB_JUMBO_INIT 1
+
+/*
+ * Do not change this unless you know EXACTLY what you're doing. This is
+ * the pre-calculated number of pages of jumbo clusters to allocate per
+ * "bucket." Here's how it works:
+ *
+ * - MJUMBOSIZE is a constant, and we picked it to be 9216 bytes. This should
+ * be enough to accommodate large 9K frames, and a reference counter for them.
+ * - 'n' is the number of jumbo clusters per bucket.
+ *
+ * For minimum space wastage to occur, we need:
+ * (MJUMBOSIZE * n) % PAGE_SIZE == 0.
+ * We want to pick the smallest possible 'n' so that our buckets don't span
+ * too much space. For the smallest PAGE_SIZE of 4K (as on i386, for example),
+ * 'n' is 4, and this means that we will need:
+ * (MJUMBOSIZE * 4 / PAGE_SIZE) = JMB_PG_BUCKET = 9 pages per bucket.
+ * This corresponds to the smallest 'n' that we can find for a 4K page size.
+ * For a larger page size (e.g., 8K), we just have PAGE_SIZE = 2 * 4096,
+ * so n = 2 * 4 = 8 and the bucket is still 9 pages. We could calculate
+ * 'n' at runtime for our page size, as long as PAGE_SIZE is a power-of-2
+ * multiple of 4K and as long as we define JMB_PG_BUCKET here appropriately.
+ */
+#define JMB_PG_BUCKET 9
+#endif
/*
* Internal flags that allow for cache locks to remain "persistent" across
@@ -357,6 +403,7 @@
mb_list_mbuf.ml_mapfull = 0;
mb_list_mbuf.ml_objsize = MSIZE;
mb_list_mbuf.ml_wmhigh = &mbuf_limit;
+ mb_list_mbuf.ml_bucksize = PAGE_SIZE;
mb_map_size = (vm_size_t)(nmbclusters * MCLBYTES);
mb_map_size = rounddown(mb_map_size, PAGE_SIZE);
@@ -371,6 +418,7 @@
mb_list_clust.ml_mapfull = 0;
mb_list_clust.ml_objsize = MCLBYTES;
mb_list_clust.ml_wmhigh = &clust_limit;
+ mb_list_clust.ml_bucksize = PAGE_SIZE;
/*
* Allocate required general (global) containers for each object type.
@@ -428,11 +476,45 @@
*/
mbstat.m_msize = MSIZE;
mbstat.m_mclbytes = MCLBYTES;
+ mbstat.m_mjumbobytes = MJUMBOBYTES;
mbstat.m_minclsize = MINCLSIZE;
mbstat.m_mlen = MLEN;
mbstat.m_mhlen = MHLEN;
mbstat.m_numtypes = MT_NTYPES;
+#ifdef NMBJUMBOBUFS
+ mb_map_size = (vm_size_t)(nmbjumbobufs * MJUMBOSIZE);
+ mb_map_size = roundup(mb_map_size, JMB_PG_BUCKET * PAGE_SIZE);
+ mb_list_jumbo.ml_btable = malloc((unsigned long)mb_map_size /
+ (JMB_PG_BUCKET * PAGE_SIZE) * sizeof(struct mb_bucket *),
+ M_MBUF, M_NOWAIT);
+ if (mb_list_jumbo.ml_btable == NULL)
+ goto bad;
+ mb_list_jumbo.ml_map = kmem_suballoc(kmem_map,
+ &(mb_list_jumbo.ml_mapbase), &(mb_list_jumbo.ml_maptop),
+ mb_map_size);
+ mb_list_jumbo.ml_map->system_map = 1;
+ mb_list_jumbo.ml_mapfull = 0;
+ mb_list_jumbo.ml_objsize = MJUMBOSIZE;
+ mb_list_jumbo.ml_wmhigh = &jumbo_limit;
+ mb_list_jumbo.ml_bucksize = JMB_PG_BUCKET * PAGE_SIZE;
+ mb_list_jumbo.ml_genlist = malloc(sizeof(struct mb_gen_list),
+ M_MBUF, M_NOWAIT);
+ if (mb_list_jumbo.ml_genlist == NULL)
+ goto bad;
+ cv_init(&(mb_list_jumbo.ml_genlist->mgl_mstarved),
+ "jumbo cluster pool starved");
+ mb_list_jumbo.ml_genlist->mb_cont.mc_lock = &mbuf_gen;
+ mb_list_jumbo.ml_genlist->mb_cont.mc_numowner = MB_GENLIST_OWNER;
+ mb_list_jumbo.ml_genlist->mb_cont.mc_starved = 0;
+ mb_list_jumbo.ml_genlist->mb_cont.mc_objcount =
+ &(mb_statpcpu[MB_GENLIST_OWNER].mb_jbfree);
+ mb_list_jumbo.ml_genlist->mb_cont.mc_numpgs =
+ &(mb_statpcpu[MB_GENLIST_OWNER].mb_jbpgs);
+ mb_list_jumbo.ml_genlist->mb_cont.mc_types = NULL;
+ SLIST_INIT(&(mb_list_jumbo.ml_genlist->mb_cont.mc_bhead));
+#endif
+
/*
* Allocate and initialize PCPU containers.
*/
@@ -492,6 +574,30 @@
goto bad;
}
MB_UNLOCK_CONT(pcpu_cnt);
+
+#ifdef NMBJUMBOBUFS
+ mb_list_jumbo.ml_cntlst[i] = malloc(sizeof(struct mb_pcpu_list),
+ M_MBUF, M_NOWAIT);
+ if (mb_list_jumbo.ml_cntlst[i] == NULL)
+ goto bad;
+ mb_list_jumbo.ml_cntlst[i]->mb_cont.mc_lock = &mbuf_pcpu[i];
+ mb_list_jumbo.ml_cntlst[i]->mb_cont.mc_numowner = i;
+ mb_list_jumbo.ml_cntlst[i]->mb_cont.mc_starved = 0;
+ mb_list_jumbo.ml_cntlst[i]->mb_cont.mc_objcount =
+ &(mb_statpcpu[i].mb_jbfree);
+ mb_list_jumbo.ml_cntlst[i]->mb_cont.mc_numpgs =
+ &(mb_statpcpu[i].mb_jbpgs);
+ mb_list_jumbo.ml_cntlst[i]->mb_cont.mc_types = NULL;
+ SLIST_INIT(&(mb_list_jumbo.ml_cntlst[i]->mb_cont.mc_bhead));
+ pcpu_cnt = MB_GET_PCPU_LIST_NUM(&mb_list_jumbo, i);
+ MB_LOCK_CONT(pcpu_cnt);
+ for (j = 0; j < NMB_JUMBO_INIT; j++) {
+ if (mb_pop_cont(&mb_list_jumbo, M_DONTWAIT, pcpu_cnt)
+ == NULL)
+ goto bad;
+ }
+ MB_UNLOCK_CONT(pcpu_cnt);
+#endif
}
return;
@@ -527,12 +633,12 @@
return (NULL);
bucket = malloc(sizeof(struct mb_bucket) +
- PAGE_SIZE / mb_list->ml_objsize * sizeof(void *), M_MBUF,
+ mb_list->ml_bucksize / mb_list->ml_objsize * sizeof(void *), M_MBUF,
how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
if (bucket == NULL)
return (NULL);
- p = (caddr_t)kmem_malloc(mb_list->ml_map, PAGE_SIZE,
+ p = (caddr_t)kmem_malloc(mb_list->ml_map, mb_list->ml_bucksize,
how == M_TRYWAIT ? M_WAITOK : M_NOWAIT);
if (p == NULL) {
free(bucket, M_MBUF);
@@ -542,8 +648,9 @@
}
bucket->mb_numfree = 0;
- mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list)] = bucket;
- for (i = 0; i < (PAGE_SIZE / mb_list->ml_objsize); i++) {
+ mb_list->ml_btable[MB_BUCKET_INDX(p, mb_list,
+ mb_list->ml_bucksize)] = bucket;
+ for (i = 0; i < (mb_list->ml_bucksize / mb_list->ml_objsize); i++) {
bucket->mb_free[i] = p;
bucket->mb_numfree++;
p += mb_list->ml_objsize;
@@ -805,7 +912,8 @@
struct mb_bucket *bucket;
u_int owner;
- bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list)];
+ bucket = mb_list->ml_btable[MB_BUCKET_INDX(m, mb_list,
+ mb_list->ml_bucksize)];
/*
* Make sure that if after we lock the bucket's present container the
@@ -957,9 +1065,9 @@
* being freed in an effort to keep the mbtypes
* counters approximately balanced across all lists.
*/
- MB_MBTYPES_DEC(cnt_lst, type, (PAGE_SIZE /
+ MB_MBTYPES_DEC(cnt_lst, type, (mb_list->ml_bucksize /
mb_list->ml_objsize) - bucket->mb_numfree);
- MB_MBTYPES_INC(gen_list, type, (PAGE_SIZE /
+ MB_MBTYPES_INC(gen_list, type, (mb_list->ml_bucksize /
mb_list->ml_objsize) - bucket->mb_numfree);
MB_UNLOCK_CONT(gen_list);
Index: src/sys/sys/mbuf.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/mbuf.h,v
retrieving revision 1.98
diff -u -r1.98 mbuf.h
--- src/sys/sys/mbuf.h 30 Jul 2002 22:03:57 -0000 1.98
+++ src/sys/sys/mbuf.h 9 Aug 2002 19:06:05 -0000
@@ -44,9 +44,9 @@
#include <sys/mac.h>
/*
- * Mbufs are of a single size, MSIZE (machine/param.h), which
+ * Mbufs are of a single size, MSIZE (sys/param.h), which
* includes overhead. An mbuf may add a single "mbuf cluster" of size
- * MCLBYTES (also in machine/param.h), which has no additional overhead
+ * MCLBYTES (also in sys/param.h), which has no additional overhead
* and is used instead of the internal data area; this is done when
* at least MINCLSIZE of data must be stored. Additionally, it is possible
* to allocate a separate buffer externally and attach it to the mbuf in
@@ -57,6 +57,16 @@
#define MINCLSIZE (MHLEN + 1) /* smallest amount to put in cluster */
#define M_MAXCOMPRESS (MHLEN / 2) /* max amount to copy for compression */
+/*
+ * Jumbo clusters/buffers are (9216 - sizeof(u_int)) bytes in size (the
+ * trailing u_int is used as a ref. count). They are _virtually_
+ * contiguous data regions that can be attached to mbufs. They are
+ * typically used for large >9K frames with devices that can do
+ * scatter/gather. MJUMBOBYTES is the size of the actual data region.
+ */
+#define MJUMBOSIZE 9216
+#define MJUMBOBYTES (MJUMBOSIZE - sizeof(u_int))
+
#ifdef _KERNEL
/*-
* Macros for type conversion:
@@ -225,6 +235,8 @@
u_long mb_mbpgs;
u_long mb_clfree;
u_long mb_clpgs;
+ u_long mb_jbfree;
+ u_long mb_jbpgs;
long mb_mbtypes[MT_NTYPES];
short mb_active;
};
@@ -245,6 +257,7 @@
u_long m_mpfail; /* XXX: times m_pullup failed */
u_long m_msize; /* length of an mbuf */
u_long m_mclbytes; /* length of an mbuf cluster */
+ u_long m_mjumbobytes; /* length of a jumbo cluster */
u_long m_minclsize; /* min length of data to allocate a cluster */
u_long m_mlen; /* length of data in an mbuf */
u_long m_mhlen; /* length of data in a header mbuf */
@@ -462,6 +475,7 @@
extern int nmbclusters; /* Maximum number of clusters */
extern int nmbcnt; /* Scale kmem_map for counter space */
extern int nmbufs; /* Maximum number of mbufs */
+extern int nmbjumbobufs; /* Maximum number of jumbo clusters */
extern int nsfbufs; /* Number of sendfile(2) bufs */
void _mext_free(struct mbuf *);