Date: Mon, 1 Sep 2008 21:08:03 GMT
From: Konrad Jankowski <konrad@FreeBSD.org>
To: Perforce Change Reviews <perforce@FreeBSD.org>
Subject: PERFORCE change 149020 for review
Message-ID: <200809012108.m81L83GC020959@repoman.freebsd.org>
http://perforce.freebsd.org/chv.cgi?CH=149020 Change 149020 by konrad@vspredator on 2008/09/01 21:07:06 ifc Affected files ... .. //depot/projects/soc2008/konrad_collation/libc/include/libc_private.h#5 integrate .. //depot/projects/soc2008/konrad_collation/libc/locale/collate.c#9 edit .. //depot/projects/soc2008/konrad_collation/libc/stdlib/Makefile.inc#5 integrate .. //depot/projects/soc2008/konrad_collation/libc/stdlib/Symbol.map#5 integrate .. //depot/projects/soc2008/konrad_collation/libc/stdlib/malloc.3#5 integrate .. //depot/projects/soc2008/konrad_collation/libc/stdlib/malloc.c#6 integrate .. //depot/projects/soc2008/konrad_collation/libc/stdlib/ptsname.3#1 branch .. //depot/projects/soc2008/konrad_collation/libc/stdlib/ptsname.c#1 branch .. //depot/projects/soc2008/konrad_collation/libc/sys/Makefile.inc#5 integrate .. //depot/projects/soc2008/konrad_collation/libc/sys/Symbol.map#5 integrate .. //depot/projects/soc2008/konrad_collation/libc/sys/execve.2#5 integrate .. //depot/projects/soc2008/konrad_collation/libc/sys/getrlimit.2#5 integrate .. //depot/projects/soc2008/konrad_collation/libc/sys/posix_openpt.2#1 branch .. //depot/projects/soc2008/konrad_collation/libc/sys/wait.2#5 integrate Differences ... ==== //depot/projects/soc2008/konrad_collation/libc/include/libc_private.h#5 (text+ko) ==== @@ -26,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: src/lib/libc/include/libc_private.h,v 1.19 2008/06/23 05:22:06 ed Exp $ + * $FreeBSD: src/lib/libc/include/libc_private.h,v 1.20 2008/08/27 02:00:53 jasone Exp $ * * Private definitions for libc, libc_r and libpthread. * @@ -158,6 +158,12 @@ extern const char *__progname; /* + * This function is used by the threading libraries to notify malloc that a + * thread is exiting. + */ +void _malloc_thread_cleanup(void); + +/* * These functions are used by the threading libraries in order to protect * malloc across fork(). */ ==== //depot/projects/soc2008/konrad_collation/libc/locale/collate.c#9 (text+ko) ==== @@ -128,7 +128,7 @@ } #if _BYTE_ORDER == _LITTLE_ENDIAN for(z = 0; z < info.directive_count; z++) { - info.undef_pri[z] = ntohl(info.undef_pri[z]); + info.undef_pri.pri[z] = ntohl(info.undef_pri.pri[z]); info.subst_count[z] = ntohl(info.subst_count[z]); } info.chain_count = ntohl(info.chain_count); @@ -220,10 +220,11 @@ } { struct __collate_st_chain_pri *p = TMP->__chain_pri_table; + for(i = chains; i-- > 0; p++) { wntohl(p->str, STR_LEN); for(z = 0; z < info.directive_count; z++) - p->pri[z] = ntohl(p->pri[z]); + p->pri.pri[z] = ntohl(p->pri.pri[z]); } } if (info.large_pri_count > 0) { @@ -239,7 +240,7 @@ (void)strcpy(TMP->__encoding, encoding); (void)memcpy(&TMP->__info, &info, sizeof(info)); __collate_data = TMP; - + __collate_load_error = (info.subst_count[0] > 0 || info.subst_count[1] > 0); __collate_load_error = 0; @@ -328,6 +329,7 @@ s++; } dest_str[len] = 0; + return (dest_str); } @@ -357,9 +359,11 @@ else high = next - 1; } + return NULL; } +/* XXX maybe just return struct __collate_st_char_pri? */ static struct __collate_st_large_char_pri * largesearch(const wchar_t key) { @@ -381,15 +385,21 @@ else high = next - 1; } + return NULL; } +#if 0 void __collate_lookup(const wchar_t *t, int *len, int *prim, int *sec) { struct __collate_st_chain_pri *p2; int l; + /* + * I didn't fix this function, as it is anyway uselesess in multiple + * weight collation. 
+ */ *len = 1; *prim = *sec = 0; p2 = chainsearch(t, &l); @@ -417,38 +427,52 @@ *prim = (l = __collate_info->undef_pri[0]) >= 0 ? l : *t - l; *sec = (l = __collate_info->undef_pri[1]) >= 0 ? l : *t - l; } +#endif -void -__collate_lookup_which(const wchar_t *t, int *len, int *pri, int which) +struct __collate_st_char_pri * +__collate_lookup(const wchar_t *t, int *len) { struct __collate_st_chain_pri *p2; - int p, l; + struct __collate_st_large_char_pri *match; + int l; *len = 1; - *pri = 0; p2 = chainsearch(t, &l); if (p2) { + fprintf(stderr, "vsdebug: chainsearch succeeded"); + *len = l; + + return &p2->pri; +#if 0 + int p; + p = p2->pri[which]; /* use the chain if pri >= 0 */ if (p >= 0) { *len = l; *pri = p; + return; } +#endif } if (*t <= UCHAR_MAX) { +#if 0 *pri = __collate_char_pri_table[*t].pri[which]; - return; +#endif + return &__collate_char_pri_table[*t]; } if (__collate_info->large_pri_count > 0) { - struct __collate_st_large_char_pri *match; + match = largesearch(*t); if (match) { - *pri = match->pri.pri[which]; - return; + return &match->pri; } } +#if 0 *pri = (l = __collate_info->undef_pri[which]) >= 0 ? l : *t - l; +#endif + return &__collate_info->undef_pri; } wchar_t * @@ -482,6 +506,7 @@ if ((wcs = (wchar_t *)malloc(len * sizeof(wchar_t))) == NULL) __collate_err(EX_OSERR, __func__); wcscpy(wcs, s); + return (wcs); } @@ -722,9 +747,10 @@ if (__collate_info->chain_count > 0) { struct __collate_st_chain_pri *match; int ll; + match = chainsearch(wname, &ll); if (match) { - e = match->pri[0]; + e = match->pri.pri[0]; if (e == 0) return IGNORE_EQUIV_CLASS; return e < 0 ? -e : e; @@ -786,7 +812,8 @@ } *wp = 0; if (len > 1 && (ch = chainsearch(buf, &i)) != NULL) { - int e = ch->pri[0]; + int e = ch->pri.pri[0]; + if (e < 0) e = -e; if (e == equiv_class) ==== //depot/projects/soc2008/konrad_collation/libc/stdlib/Makefile.inc#5 (text+ko) ==== @@ -1,14 +1,14 @@ # from @(#)Makefile.inc 8.3 (Berkeley) 2/4/95 -# $FreeBSD: src/lib/libc/stdlib/Makefile.inc,v 1.54 2007/07/04 00:00:39 scf Exp $ +# $FreeBSD: src/lib/libc/stdlib/Makefile.inc,v 1.55 2008/08/20 08:31:58 ed Exp $ # machine-independent stdlib sources .PATH: ${.CURDIR}/${MACHINE_ARCH}/stdlib ${.CURDIR}/stdlib MISRCS+=_Exit.c a64l.c abort.c abs.c atexit.c atof.c atoi.c atol.c atoll.c \ bsearch.c div.c exit.c getenv.c getopt.c getopt_long.c \ - getsubopt.c grantpt.c hcreate.c heapsort.c imaxabs.c imaxdiv.c \ + getsubopt.c hcreate.c heapsort.c imaxabs.c imaxdiv.c \ insque.c l64a.c labs.c ldiv.c llabs.c lldiv.c lsearch.c malloc.c \ - merge.c qsort.c qsort_r.c radixsort.c rand.c random.c \ + merge.c ptsname.c qsort.c qsort_r.c radixsort.c rand.c random.c \ reallocf.c realpath.c remque.c strfmon.c strtoimax.c \ strtol.c strtoll.c strtoq.c strtoul.c strtonum.c strtoull.c \ strtoumax.c strtouq.c system.c tdelete.c tfind.c tsearch.c twalk.c @@ -21,10 +21,10 @@ .endif MAN+= a64l.3 abort.3 abs.3 alloca.3 atexit.3 atof.3 atoi.3 atol.3 bsearch.3 \ - div.3 exit.3 getenv.3 getopt.3 getopt_long.3 getsubopt.3 grantpt.3 \ + div.3 exit.3 getenv.3 getopt.3 getopt_long.3 getsubopt.3 \ hcreate.3 imaxabs.3 imaxdiv.3 insque.3 labs.3 ldiv.3 llabs.3 lldiv.3 \ - lsearch.3 malloc.3 memory.3 posix_memalign.3 qsort.3 radixsort.3 \ - rand.3 random.3 \ + lsearch.3 malloc.3 memory.3 posix_memalign.3 ptsname.3 qsort.3 \ + radixsort.3 rand.3 random.3 \ realpath.3 strfmon.3 strtod.3 strtol.3 strtonum.3 strtoul.3 system.3 \ tsearch.3 @@ -33,10 +33,10 @@ MLINKS+=exit.3 _Exit.3 MLINKS+=getenv.3 putenv.3 getenv.3 setenv.3 getenv.3 unsetenv.3 MLINKS+=getopt_long.3 
getopt_long_only.3 -MLINKS+=grantpt.3 posix_openpt.3 grantpt.3 ptsname.3 grantpt.3 unlockpt.3 MLINKS+=hcreate.3 hdestroy.3 hcreate.3 hsearch.3 MLINKS+=insque.3 remque.3 MLINKS+=lsearch.3 lfind.3 +MLINKS+=ptsname.3 grantpt.3 ptsname.3 unlockpt.3 MLINKS+=qsort.3 heapsort.3 qsort.3 mergesort.3 qsort.3 qsort_r.3 MLINKS+=rand.3 rand_r.3 rand.3 srand.3 rand.3 sranddev.3 MLINKS+=random.3 initstate.3 random.3 setstate.3 random.3 srandom.3 \ ==== //depot/projects/soc2008/konrad_collation/libc/stdlib/Symbol.map#5 (text) ==== @@ -1,5 +1,5 @@ /* - * $FreeBSD: src/lib/libc/stdlib/Symbol.map,v 1.7 2008/06/17 14:05:03 ed Exp $ + * $FreeBSD: src/lib/libc/stdlib/Symbol.map,v 1.9 2008/08/27 02:00:53 jasone Exp $ */ FBSD_1.0 { @@ -30,7 +30,6 @@ suboptarg; getsubopt; grantpt; - posix_openpt; ptsname; unlockpt; hcreate; @@ -94,6 +93,7 @@ }; FBSDprivate_1.0 { + _malloc_thread_cleanup; _malloc_prefork; _malloc_postfork; __system; ==== //depot/projects/soc2008/konrad_collation/libc/stdlib/malloc.3#5 (text+ko) ==== @@ -30,9 +30,9 @@ .\" SUCH DAMAGE. .\" .\" @(#)malloc.3 8.1 (Berkeley) 6/4/93 -.\" $FreeBSD: src/lib/libc/stdlib/malloc.3,v 1.78 2008/02/17 17:09:24 jasone Exp $ +.\" $FreeBSD: src/lib/libc/stdlib/malloc.3,v 1.79 2008/08/27 02:00:53 jasone Exp $ .\" -.Dd February 17, 2008 +.Dd August 26, 2008 .Dt MALLOC 3 .Os .Sh NAME @@ -154,7 +154,7 @@ implementation-dependent. .Sh TUNING Once, when the first call is made to one of these memory allocation -routines, various flags will be set or reset, which affect the +routines, various flags will be set or reset, which affects the workings of this allocator implementation. .Pp The @@ -196,6 +196,11 @@ Therefore, some applications may benefit from increasing or decreasing this threshold parameter. This option is not available for some configurations (non-PIC). +.It C +Double/halve the size of the maximum size class that is a multiple of the +cacheline size (64). +Above this size, subpage spacing (256 bytes) is used for size classes. +The default value is 512 bytes. .It D Use .Xr sbrk 2 @@ -214,6 +219,16 @@ The default is 512 pages per arena; .Ev MALLOC_OPTIONS=10f will prevent any dirty unused pages from accumulating. +.It G +When there are multiple threads, use thread-specific caching for objects that +are smaller than one page. +This option is enabled by default. +Thread-specific caching allows many allocations to be satisfied without +performing any thread synchronization, at the cost of increased memory use. +See the +.Dq R +option for related tuning information. +This option is not available for some configurations (non-PIC). .It J Each byte of new memory allocated by .Fn malloc , @@ -248,7 +263,7 @@ acquiring memory. .It N Double/halve the number of arenas. -The default number of arenas is four times the number of CPUs, or one if there +The default number of arenas is two times the number of CPUs, or one if there is a single CPU. .It P Various statistics are printed at program exit via an @@ -259,14 +274,18 @@ Therefore, this option should only be used with care; it is primarily intended as a performance tuning aid during application development. .It Q -Double/halve the size of the allocation quantum. -The default quantum is the minimum allowed by the architecture (typically 8 or -16 bytes). -.It S Double/halve the size of the maximum size class that is a multiple of the -quantum. -Above this size, power-of-two spacing is used for size classes. -The default value is 512 bytes. +quantum (8 or 16 bytes, depending on architecture). 
+Above this size, cacheline spacing is used for size classes. +The default value is 128 bytes. +.It R +Double/halve magazine size, which approximately doubles/halves the number of +rounds in each magazine. +Magazines are used by the thread-specific caching machinery to acquire and +release objects in bulk. +Increasing the magazine size decreases locking overhead, at the expense of +increased memory usage. +This option is not available for some configurations (non-PIC). .It U Generate .Dq utrace @@ -358,6 +377,13 @@ However, it may make sense to reduce the number of arenas if an application does not make much use of the allocation functions. .Pp +In addition to multiple arenas, this allocator supports thread-specific +caching for small objects (smaller than one page), in order to make it +possible to completely avoid synchronization for most small allocation requests. +Such caching allows very fast allocation in the common case, but it increases +memory usage and fragmentation, since a bounded number of objects can remain +allocated in each thread cache. +.Pp Memory is conceptually broken into equal-sized chunks, where the chunk size is a power of two that is greater than the page size. Chunks are always aligned to multiples of the chunk size. @@ -366,7 +392,7 @@ .Pp User objects are broken into three categories according to size: small, large, and huge. -Small objects are no larger than one half of a page. +Small objects are smaller than one page. Large objects are smaller than the chunk size. Huge objects are a multiple of the chunk size. Small and large objects are managed by arenas; huge objects are managed @@ -378,23 +404,24 @@ contiguous pages (unused, backing a set of small objects, or backing one large object). The combination of chunk alignment and chunk page maps makes it possible to -determine all metadata regarding small and large allocations in -constant and logarithmic time, respectively. +determine all metadata regarding small and large allocations in constant time. .Pp Small objects are managed in groups by page runs. Each run maintains a bitmap that tracks which regions are in use. -Allocation requests that are no more than half the quantum (see the +Allocation requests that are no more than half the quantum (8 or 16, depending +on architecture) are rounded up to the nearest power of two. +Allocation requests that are more than half the quantum, but no more than the +minimum cacheline-multiple size class (see the .Dq Q -option) are rounded up to the nearest power of two (typically 2, 4, or 8). -Allocation requests that are more than half the quantum, but no more than the -maximum quantum-multiple size class (see the -.Dq S option) are rounded up to the nearest multiple of the quantum. -Allocation requests that are larger than the maximum quantum-multiple size -class, but no larger than one half of a page, are rounded up to the nearest -power of two. -Allocation requests that are larger than half of a page, but small enough to -fit in an arena-managed chunk (see the +Allocation requests that are more than the minumum cacheline-multiple size +class, but no more than the minimum subpage-multiple size class (see the +.Dq C +option) are rounded up to the nearest multiple of the cacheline size (64). +Allocation requests that are more than the minimum subpage-multiple size class +are rounded up to the nearest multiple of the subpage size (256). 
+Allocation requests that are more than one page, but small enough to fit in +an arena-managed chunk (see the .Dq K option), are rounded up to the nearest run size. Allocation requests that are too large to fit in an arena-managed chunk are @@ -402,8 +429,8 @@ .Pp Allocations are packed tightly together, which can be an issue for multi-threaded applications. -If you need to assure that allocations do not suffer from cache line sharing, -round your allocation requests up to the nearest multiple of the cache line +If you need to assure that allocations do not suffer from cacheline sharing, +round your allocation requests up to the nearest multiple of the cacheline size. .Sh DEBUGGING MALLOC PROBLEMS The first thing to do is to set the ==== //depot/projects/soc2008/konrad_collation/libc/stdlib/malloc.c#6 (text+ko) ==== @@ -35,6 +35,9 @@ * + Multiple arenas are used if there are multiple CPUs, which reduces lock * contention and cache sloshing. * + * + Thread-specific caching is used if there are multiple threads, which + * reduces the amount of locking. + * * + Cache line sharing between arenas is avoided for internal data * structures. * @@ -48,37 +51,49 @@ * and a 16 byte quantum on a 32-bit system, the size classes in each category * are as follows: * - * |=====================================| - * | Category | Subcategory | Size | - * |=====================================| - * | Small | Tiny | 2 | - * | | | 4 | - * | | | 8 | - * | |----------------+---------| - * | | Quantum-spaced | 16 | - * | | | 32 | - * | | | 48 | - * | | | ... | - * | | | 480 | - * | | | 496 | - * | | | 512 | - * | |----------------+---------| - * | | Sub-page | 1 kB | - * | | | 2 kB | - * |=====================================| - * | Large | 4 kB | - * | | 8 kB | - * | | 12 kB | - * | | ... | - * | | 1012 kB | - * | | 1016 kB | - * | | 1020 kB | - * |=====================================| - * | Huge | 1 MB | - * | | 2 MB | - * | | 3 MB | - * | | ... | - * |=====================================| + * |=======================================| + * | Category | Subcategory | Size | + * |=======================================| + * | Small | Tiny | 2 | + * | | | 4 | + * | | | 8 | + * | |------------------+---------| + * | | Quantum-spaced | 16 | + * | | | 32 | + * | | | 48 | + * | | | ... | + * | | | 96 | + * | | | 112 | + * | | | 128 | + * | |------------------+---------| + * | | Cacheline-spaced | 192 | + * | | | 256 | + * | | | 320 | + * | | | 384 | + * | | | 448 | + * | | | 512 | + * | |------------------+---------| + * | | Sub-page | 760 | + * | | | 1024 | + * | | | 1280 | + * | | | ... | + * | | | 3328 | + * | | | 3584 | + * | | | 3840 | + * |=======================================| + * | Large | 4 kB | + * | | 8 kB | + * | | 12 kB | + * | | ... | + * | | 1012 kB | + * | | 1016 kB | + * | | 1020 kB | + * |=======================================| + * | Huge | 1 MB | + * | | 2 MB | + * | | 3 MB | + * | | ... | + * |=======================================| * * A different mechanism is used for each category: * @@ -113,6 +128,19 @@ #endif /* + * MALLOC_TINY enables support for tiny objects, which are smaller than one + * quantum. + */ +#define MALLOC_TINY + +/* + * MALLOC_MAG enables a magazine-based thread-specific caching layer for small + * objects. This makes it possible to allocate/deallocate objects without any + * locking when the cache is in the steady state. 
+ */ +#define MALLOC_MAG + +/* * MALLOC_BALANCE enables monitoring of arena lock contention and dynamically * re-balances arena load if exponentially averaged contention exceeds a * certain threshold. @@ -128,7 +156,7 @@ #define MALLOC_DSS #include <sys/cdefs.h> -__FBSDID("$FreeBSD: src/lib/libc/stdlib/malloc.c,v 1.176 2008/08/14 17:31:42 jasone Exp $"); +__FBSDID("$FreeBSD: src/lib/libc/stdlib/malloc.c,v 1.177 2008/08/27 02:00:53 jasone Exp $"); #include "libc_private.h" #ifdef MALLOC_DEBUG @@ -184,46 +212,63 @@ /* Size of stack-allocated buffer passed to strerror_r(). */ #define STRERROR_BUF 64 -/* Minimum alignment of allocations is 2^QUANTUM_2POW_MIN bytes. */ +/* + * The const_size2bin table is sized according to PAGESIZE_2POW, but for + * correctness reasons, we never assume that + * (pagesize == (1U << * PAGESIZE_2POW)). + * + * Minimum alignment of allocations is 2^QUANTUM_2POW bytes. + */ #ifdef __i386__ -# define QUANTUM_2POW_MIN 4 +# define PAGESIZE_2POW 12 +# define QUANTUM_2POW 4 # define SIZEOF_PTR_2POW 2 # define CPU_SPINWAIT __asm__ volatile("pause") #endif #ifdef __ia64__ -# define QUANTUM_2POW_MIN 4 +# define PAGESIZE_2POW 12 +# define QUANTUM_2POW 4 # define SIZEOF_PTR_2POW 3 #endif #ifdef __alpha__ -# define QUANTUM_2POW_MIN 4 +# define PAGESIZE_2POW 13 +# define QUANTUM_2POW 4 # define SIZEOF_PTR_2POW 3 # define NO_TLS #endif #ifdef __sparc64__ -# define QUANTUM_2POW_MIN 4 +# define PAGESIZE_2POW 13 +# define QUANTUM_2POW 4 # define SIZEOF_PTR_2POW 3 # define NO_TLS #endif #ifdef __amd64__ -# define QUANTUM_2POW_MIN 4 +# define PAGESIZE_2POW 12 +# define QUANTUM_2POW 4 # define SIZEOF_PTR_2POW 3 # define CPU_SPINWAIT __asm__ volatile("pause") #endif #ifdef __arm__ -# define QUANTUM_2POW_MIN 3 +# define PAGESIZE_2POW 12 +# define QUANTUM_2POW 3 # define SIZEOF_PTR_2POW 2 # define NO_TLS #endif #ifdef __mips__ -# define QUANTUM_2POW_MIN 3 +# define PAGESIZE_2POW 12 +# define QUANTUM_2POW 3 # define SIZEOF_PTR_2POW 2 # define NO_TLS #endif #ifdef __powerpc__ -# define QUANTUM_2POW_MIN 4 +# define PAGESIZE_2POW 12 +# define QUANTUM_2POW 4 # define SIZEOF_PTR_2POW 2 #endif +#define QUANTUM ((size_t)(1U << QUANTUM_2POW)) +#define QUANTUM_MASK (QUANTUM - 1) + #define SIZEOF_PTR (1U << SIZEOF_PTR_2POW) /* sizeof(int) == (1U << SIZEOF_INT_2POW). */ @@ -237,6 +282,10 @@ #endif #ifdef NO_TLS + /* MALLOC_MAG requires TLS. */ +# ifdef MALLOC_MAG +# undef MALLOC_MAG +# endif /* MALLOC_BALANCE requires TLS. */ # ifdef MALLOC_BALANCE # undef MALLOC_BALANCE @@ -253,23 +302,42 @@ #define DIRTY_MAX_DEFAULT (1U << 9) /* - * Maximum size of L1 cache line. This is used to avoid cache line aliasing, - * so over-estimates are okay (up to a point), but under-estimates will - * negatively affect performance. + * Maximum size of L1 cache line. This is used to avoid cache line aliasing. + * In addition, this controls the spacing of cacheline-spaced size classes. */ #define CACHELINE_2POW 6 #define CACHELINE ((size_t)(1U << CACHELINE_2POW)) +#define CACHELINE_MASK (CACHELINE - 1) -/* Smallest size class to support. */ -#define TINY_MIN_2POW 1 +/* + * Subpages are an artificially designated partitioning of pages. Their only + * purpose is to support subpage-spaced size classes. + * + * There must be at least 4 subpages per page, due to the way size classes are + * handled. + */ +#define SUBPAGE_2POW 8 +#define SUBPAGE ((size_t)(1U << SUBPAGE_2POW)) +#define SUBPAGE_MASK (SUBPAGE - 1) + +#ifdef MALLOC_TINY + /* Smallest size class to support. 
*/ +# define TINY_MIN_2POW 1 +#endif /* * Maximum size class that is a multiple of the quantum, but not (necessarily) * a power of 2. Above this size, allocations are rounded up to the nearest * power of 2. */ -#define SMALL_MAX_2POW_DEFAULT 9 -#define SMALL_MAX_DEFAULT (1U << SMALL_MAX_2POW_DEFAULT) +#define QSPACE_MAX_2POW_DEFAULT 7 + +/* + * Maximum size class that is a multiple of the cacheline, but not (necessarily) + * a power of 2. Above this size, allocations are rounded up to the nearest + * power of 2. + */ +#define CSPACE_MAX_2POW_DEFAULT 9 /* * RUN_MAX_OVRHD indicates maximum desired run header overhead. Runs are sized @@ -293,8 +361,7 @@ #define RUN_MAX_OVRHD_RELAX 0x00001800U /* Put a cap on small object run size. This overrides RUN_MAX_OVRHD. */ -#define RUN_MAX_SMALL_2POW 15 -#define RUN_MAX_SMALL (1U << RUN_MAX_SMALL_2POW) +#define RUN_MAX_SMALL (12 * pagesize) /* * Hyper-threaded CPUs may need a special instruction inside spin loops in @@ -319,6 +386,15 @@ */ #define BLOCK_COST_2POW 4 +#ifdef MALLOC_MAG + /* + * Default magazine size, in bytes. max_rounds is calculated to make + * optimal use of the space, leaving just enough room for the magazine + * header. + */ +# define MAG_SIZE_2POW_DEFAULT 9 +#endif + #ifdef MALLOC_BALANCE /* * We use an exponential moving average to track recent lock contention, @@ -369,6 +445,11 @@ */ uint64_t nrequests; +#ifdef MALLOC_MAG + /* Number of magazine reloads from this bin. */ + uint64_t nmags; +#endif + /* Total number of runs created for this bin's size class. */ uint64_t nruns; @@ -678,6 +759,35 @@ /******************************************************************************/ /* + * Magazine data structures. + */ + +#ifdef MALLOC_MAG +typedef struct mag_s mag_t; +struct mag_s { + size_t binind; /* Index of associated bin. */ + size_t nrounds; + void *rounds[1]; /* Dynamically sized. */ +}; + +/* + * Magazines are lazily allocated, but once created, they remain until the + * associated mag_rack is destroyed. + */ +typedef struct bin_mags_s bin_mags_t; +struct bin_mags_s { + mag_t *curmag; + mag_t *sparemag; +}; + +typedef struct mag_rack_s mag_rack_t; +struct mag_rack_s { + bin_mags_t bin_mags[1]; /* Dynamically sized. */ +}; +#endif + +/******************************************************************************/ +/* * Data. */ @@ -690,16 +800,147 @@ static size_t pagesize_2pow; /* Various bin-related settings. */ -static size_t bin_maxclass; /* Max size class for bins. */ -static unsigned ntbins; /* Number of (2^n)-spaced tiny bins. */ +#ifdef MALLOC_TINY /* Number of (2^n)-spaced tiny bins. */ +# define ntbins ((unsigned)(QUANTUM_2POW - TINY_MIN_2POW)) +#else +# define ntbins 0 +#endif static unsigned nqbins; /* Number of quantum-spaced bins. */ -static unsigned nsbins; /* Number of (2^n)-spaced sub-page bins. */ -static size_t small_min; -static size_t small_max; +static unsigned ncbins; /* Number of cacheline-spaced bins. */ +static unsigned nsbins; /* Number of subpage-spaced bins. */ +static unsigned nbins; +#ifdef MALLOC_TINY +# define tspace_max ((size_t)(QUANTUM >> 1)) +#endif +#define qspace_min QUANTUM +static size_t qspace_max; +static size_t cspace_min; +static size_t cspace_max; +static size_t sspace_min; +static size_t sspace_max; +#define bin_maxclass sspace_max + +static uint8_t const *size2bin; +/* + * const_size2bin is a static constant lookup table that in the common case can + * be used as-is for size2bin. For dynamically linked programs, this avoids + * a page of memory overhead per process. 
+ */ +#define S2B_1(i) i, +#define S2B_2(i) S2B_1(i) S2B_1(i) +#define S2B_4(i) S2B_2(i) S2B_2(i) +#define S2B_8(i) S2B_4(i) S2B_4(i) +#define S2B_16(i) S2B_8(i) S2B_8(i) +#define S2B_32(i) S2B_16(i) S2B_16(i) +#define S2B_64(i) S2B_32(i) S2B_32(i) +#define S2B_128(i) S2B_64(i) S2B_64(i) +#define S2B_256(i) S2B_128(i) S2B_128(i) +static const uint8_t const_size2bin[(1U << PAGESIZE_2POW) - 255] = { + S2B_1(0xffU) /* 0 */ +#if (QUANTUM_2POW == 4) +/* 64-bit system ************************/ +# ifdef MALLOC_TINY + S2B_2(0) /* 2 */ + S2B_2(1) /* 4 */ + S2B_4(2) /* 8 */ + S2B_8(3) /* 16 */ +# define S2B_QMIN 3 +# else + S2B_16(0) /* 16 */ +# define S2B_QMIN 0 +# endif + S2B_16(S2B_QMIN + 1) /* 32 */ + S2B_16(S2B_QMIN + 2) /* 48 */ + S2B_16(S2B_QMIN + 3) /* 64 */ + S2B_16(S2B_QMIN + 4) /* 80 */ + S2B_16(S2B_QMIN + 5) /* 96 */ + S2B_16(S2B_QMIN + 6) /* 112 */ + S2B_16(S2B_QMIN + 7) /* 128 */ +# define S2B_CMIN (S2B_QMIN + 8) +#else +/* 32-bit system ************************/ +# ifdef MALLOC_TINY + S2B_2(0) /* 2 */ + S2B_2(1) /* 4 */ + S2B_4(2) /* 8 */ +# define S2B_QMIN 2 +# else + S2B_8(0) /* 8 */ +# define S2B_QMIN 0 +# endif + S2B_8(S2B_QMIN + 1) /* 16 */ + S2B_8(S2B_QMIN + 2) /* 24 */ + S2B_8(S2B_QMIN + 3) /* 32 */ + S2B_8(S2B_QMIN + 4) /* 40 */ + S2B_8(S2B_QMIN + 5) /* 48 */ + S2B_8(S2B_QMIN + 6) /* 56 */ + S2B_8(S2B_QMIN + 7) /* 64 */ + S2B_8(S2B_QMIN + 8) /* 72 */ + S2B_8(S2B_QMIN + 9) /* 80 */ + S2B_8(S2B_QMIN + 10) /* 88 */ + S2B_8(S2B_QMIN + 11) /* 96 */ + S2B_8(S2B_QMIN + 12) /* 104 */ + S2B_8(S2B_QMIN + 13) /* 112 */ + S2B_8(S2B_QMIN + 14) /* 120 */ + S2B_8(S2B_QMIN + 15) /* 128 */ +# define S2B_CMIN (S2B_QMIN + 16) +#endif +/****************************************/ + S2B_64(S2B_CMIN + 0) /* 192 */ + S2B_64(S2B_CMIN + 1) /* 256 */ + S2B_64(S2B_CMIN + 2) /* 320 */ + S2B_64(S2B_CMIN + 3) /* 384 */ + S2B_64(S2B_CMIN + 4) /* 448 */ + S2B_64(S2B_CMIN + 5) /* 512 */ +# define S2B_SMIN (S2B_CMIN + 6) + S2B_256(S2B_SMIN + 0) /* 768 */ + S2B_256(S2B_SMIN + 1) /* 1024 */ + S2B_256(S2B_SMIN + 2) /* 1280 */ + S2B_256(S2B_SMIN + 3) /* 1536 */ + S2B_256(S2B_SMIN + 4) /* 1792 */ + S2B_256(S2B_SMIN + 5) /* 2048 */ + S2B_256(S2B_SMIN + 6) /* 2304 */ + S2B_256(S2B_SMIN + 7) /* 2560 */ + S2B_256(S2B_SMIN + 8) /* 2816 */ + S2B_256(S2B_SMIN + 9) /* 3072 */ + S2B_256(S2B_SMIN + 10) /* 3328 */ + S2B_256(S2B_SMIN + 11) /* 3584 */ + S2B_256(S2B_SMIN + 12) /* 3840 */ +#if (PAGESIZE_2POW == 13) + S2B_256(S2B_SMIN + 13) /* 4096 */ + S2B_256(S2B_SMIN + 14) /* 4352 */ + S2B_256(S2B_SMIN + 15) /* 4608 */ + S2B_256(S2B_SMIN + 16) /* 4864 */ + S2B_256(S2B_SMIN + 17) /* 5120 */ + S2B_256(S2B_SMIN + 18) /* 5376 */ + S2B_256(S2B_SMIN + 19) /* 5632 */ + S2B_256(S2B_SMIN + 20) /* 5888 */ + S2B_256(S2B_SMIN + 21) /* 6144 */ + S2B_256(S2B_SMIN + 22) /* 6400 */ + S2B_256(S2B_SMIN + 23) /* 6656 */ + S2B_256(S2B_SMIN + 24) /* 6912 */ + S2B_256(S2B_SMIN + 25) /* 7168 */ + S2B_256(S2B_SMIN + 26) /* 7424 */ + S2B_256(S2B_SMIN + 27) /* 7680 */ + S2B_256(S2B_SMIN + 28) /* 7936 */ +#endif +}; +#undef S2B_1 +#undef S2B_2 +#undef S2B_4 +#undef S2B_8 +#undef S2B_16 +#undef S2B_32 +#undef S2B_64 +#undef S2B_128 +#undef S2B_256 +#undef S2B_QMIN +#undef S2B_CMIN +#undef S2B_SMIN -/* Various quantum-related settings. */ -static size_t quantum; -static size_t quantum_mask; /* (quantum - 1). */ +#ifdef MALLOC_MAG +static size_t max_rounds; +#endif /* Various chunk-related settings. 
*/ static size_t chunksize; @@ -796,6 +1037,14 @@ static __thread arena_t *arenas_map; #endif +#ifdef MALLOC_MAG +/* + * Map of thread-specific magazine racks, used for thread-specific object + * caching. + */ +static __thread mag_rack_t *mag_rack; +#endif + #ifdef MALLOC_STATS /* Chunk statistics. */ static chunk_stats_t stats_chunks; @@ -818,13 +1067,17 @@ static bool opt_dss = true; static bool opt_mmap = true; #endif +#ifdef MALLOC_MAG +static bool opt_mag = true; +static size_t opt_mag_size_2pow = MAG_SIZE_2POW_DEFAULT; +#endif static size_t opt_dirty_max = DIRTY_MAX_DEFAULT; #ifdef MALLOC_BALANCE static uint64_t opt_balance_threshold = BALANCE_THRESHOLD_DEFAULT; #endif static bool opt_print_stats = false; -static size_t opt_quantum_2pow = QUANTUM_2POW_MIN; -static size_t opt_small_max_2pow = SMALL_MAX_2POW_DEFAULT; +static size_t opt_qspace_max_2pow = QSPACE_MAX_2POW_DEFAULT; +static size_t opt_cspace_max_2pow = CSPACE_MAX_2POW_DEFAULT; static size_t opt_chunk_2pow = CHUNK_2POW_DEFAULT; static bool opt_utrace = false; static bool opt_sysv = false; @@ -902,15 +1155,21 @@ static void arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk, arena_run_t *run, size_t oldsize, size_t newsize, bool dirty); static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin); -static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin); -static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size); +static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin); +static size_t arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size); #ifdef MALLOC_BALANCE static void arena_lock_balance_hard(arena_t *arena); #endif +#ifdef MALLOC_MAG +static void mag_load(mag_t *mag); +#endif static void *arena_malloc_large(arena_t *arena, size_t size, bool zero); static void *arena_palloc(arena_t *arena, size_t alignment, size_t size, size_t alloc_size); static size_t arena_salloc(const void *ptr); +#ifdef MALLOC_MAG +static void mag_unload(mag_t *mag); +#endif static void arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr); static void arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk, @@ -921,11 +1180,22 @@ static void *arena_ralloc(void *ptr, size_t size, size_t oldsize); static bool arena_new(arena_t *arena); static arena_t *arenas_extend(unsigned ind); +#ifdef MALLOC_MAG +static mag_t *mag_create(arena_t *arena, size_t binind); +static void mag_destroy(mag_t *mag); +static mag_rack_t *mag_rack_create(arena_t *arena); >>> TRUNCATED FOR MAIL (1000 lines) <<<
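For context on the reworked collation interface in collate.c: __collate_lookup() now returns the whole per-character priority record for a position in the string, together with the number of wide characters it covers, instead of writing a single primary/secondary pair through out-parameters. A caller comparing two strings on one weight level might look roughly like the sketch below. The incomplete struct declaration and the __collate_weight() accessor are placeholders, since the actual collate.h definitions are not part of this diff; the point being illustrated is advancing by *len so that multi-character chains are handled.

#include <wchar.h>

struct __collate_st_char_pri;                   /* real definition lives in collate.h */
struct __collate_st_char_pri *__collate_lookup(const wchar_t *, int *);
/*
 * Hypothetical accessor: extract the weight for one level, applying the
 * "negative priority is relative to the character code" convention seen
 * in the old __collate_lookup() above.
 */
int __collate_weight(const struct __collate_st_char_pri *, int level, wchar_t c);

static int
collate_cmp_level(const wchar_t *a, const wchar_t *b, int level)
{
        while (*a != L'\0' && *b != L'\0') {
                int la, lb;
                int wa = __collate_weight(__collate_lookup(a, &la), level, *a);
                int wb = __collate_weight(__collate_lookup(b, &lb), level, *b);

                if (wa != wb)
                        return (wa < wb ? -1 : 1);
                a += la;        /* *len > 1 when a multi-character chain matched */
                b += lb;
        }
        return (*a != L'\0') - (*b != L'\0');
}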
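The size-class scheme described in the malloc.3 and malloc.c hunks above boils down to four spacing regimes for small requests: tiny (power-of-two), quantum-spaced, cacheline-spaced, and subpage-spaced. The helper below restates those documented rounding rules for a 64-bit system with the default "Q" (128 bytes) and "C" (512 bytes) settings; it is only an illustration, since the allocator itself resolves a request with a single size2bin[size] table load built by the S2B_* macros rather than arithmetic.

#include <stddef.h>

static size_t
small_size_class(size_t size)
{
        const size_t quantum = 16;              /* 2^QUANTUM_2POW on 64-bit */
        const size_t cacheline = 64;            /* CACHELINE */
        const size_t subpage = 256;             /* SUBPAGE */
        const size_t qspace_max = 128;          /* default "Q" option value */
        const size_t cspace_max = 512;          /* default "C" option value */
        const size_t sspace_max = 4096 - subpage; /* largest small class: 3840 */

        if (size <= quantum / 2) {              /* tiny: 2, 4, 8 */
                size_t r = 2;
                while (r < size)
                        r <<= 1;
                return (r);
        }
        if (size <= qspace_max)                 /* quantum-spaced: 16 .. 128 */
                return ((size + quantum - 1) & ~(quantum - 1));
        if (size <= cspace_max)                 /* cacheline-spaced: 192 .. 512 */
                return ((size + cacheline - 1) & ~(cacheline - 1));
        if (size <= sspace_max)                 /* subpage-spaced: 768 .. 3840 */
                return ((size + subpage - 1) & ~(subpage - 1));
        return (0);                             /* not a small request: large/huge path */
}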
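The magazine-based thread cache added under MALLOC_MAG can be pictured as a per-thread stack of pre-allocated objects for each bin. The sketch below mirrors the mag_s layout from the diff; mag_alloc_from() and the arena_malloc_small() fallback are hypothetical names (the real mag_load() and arena allocation paths are in the portion truncated above), shown only to make concrete the documented behaviour of allocating with no locking while cached rounds remain.

#include <stddef.h>

typedef struct mag_s {
        size_t  binind;         /* Index of associated bin. */
        size_t  nrounds;        /* Rounds currently held in the magazine. */
        void    *rounds[1];     /* Dynamically sized array of cached objects. */
} mag_t;

void    *arena_malloc_small(size_t binind);     /* Hypothetical locked slow path. */
void    mag_load(mag_t *mag);                   /* Refills in bulk, presumably under the arena lock. */

static void *
mag_alloc_from(mag_t *mag)
{
        if (mag->nrounds == 0) {
                /* Cache empty: take the lock once and refill in bulk. */
                mag_load(mag);
                if (mag->nrounds == 0)
                        return (arena_malloc_small(mag->binind));
        }
        /* Common case: pop a cached object with no synchronization at all. */
        mag->nrounds--;
        return (mag->rounds[mag->nrounds]);
}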