Date:      Sat, 8 Feb 2003 00:22:44 -0800 (PST)
From:      Matthew Dillon <dillon@apollo.backplane.com>
To:        Jeff Roberson <jroberson@chesapeake.net>
Cc:        Gary Thorpe <gathorpe79@yahoo.com>, Julian Elischer <julian@elischer.org>, Peter Wemm <peter@wemm.org>, Dag-Erling Smorgrav <des@ofug.org>, <arch@FreeBSD.ORG>
Subject:   Re: New kernel allocation API 
Message-ID:  <200302080822.h188Mi15005276@apollo.backplane.com>
References:   <20030207205529.K72073-100000@mail.chesapeake.net>

:
:On Thu, 6 Feb 2003, Gary Thorpe wrote:
:
:> How will it make it faster? By jumping to the group of memory slabs
:> that would match that size?
:
:Yes.  Right now in malloc you have to look up the zone based on the
:address.  I have a hack that lets us do this in constant time.  It
:basically goes like this:

    If you take a look at /usr/src/lib/libstand/zalloc* you will see
    that the allocation interface I wrote for libstand requires the
    size to be passed to the freeing code.

    I've done it this way in virtually every allocator I've ever written,
    because it gives any allocator API *huge* flexibility, and I expect
    probably 99% of our kernel would trivially be able to pass the size
    to the freeing code.
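
    As a rough sketch (the 'foo' structure and functions below are just
    illustrative, not the actual zalloc prototypes), this style of API
    simply hands the original size back on the free, which the call site
    almost always knows via sizeof():

	struct foo {
	    int x, y;
	};

	struct foo *
	foo_create(void)
	{
	    return(slab_alloc(sizeof(struct foo)));
	}

	void
	foo_destroy(struct foo *fp)
	{
	    slab_free(fp, sizeof(struct foo));	/* size known at the call site */
	}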

    However, it is not strictly necessary from the point of view of a
    slab allocator.  If the slab allocator allocates large, aligned
    meta-blocks, then calculating the base of the slab is utterly trivial,
    e.g. it is just (ptr & ~(128 * 1024 - 1)), and the slab header can
    contain the granularity for the slab.
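
    As a minimal sketch of that calculation (ptr_to_zone is just an
    illustrative helper name; it mirrors what slab_free() below does
    with ZONE_SIZE):

	static __inline Zone *
	ptr_to_zone(void *ptr)
	{
	    /* zones are allocated on ZONE_SIZE (128K) aligned boundaries */
	    return((Zone *)((uintptr_t)ptr & ~(uintptr_t)(ZONE_SIZE - 1)));
	}

    The chunk granularity is then simply z->z_ChunkSize.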

    I've included slab code I've been working on.  The code is for another
    project, but, Jeff, you are welcome to snarf any bits that fit in
    with your allocator.  This code is somewhat tested but not life-tested.

    The code provides instant allocation and instant deallocation in O(1)
    time, with *extreme* performance and good cache and TLB characteristics.
    It happens to be conditionalized for standalone testing; just compile
    with -DSTANDALONE -DWITH_LIBC.  The only thing I haven't done yet is
    optimize the detection of individual free pages (at least when the
    allocation size does not result in a chunk overlapping a page boundary),
    but most of the infrastructure required to do that is already in place
    (it is also what gives me the good TLB hit characteristics).
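
    For example, assuming the file below is saved as slaballoc.c,
    something like:

	cc -DSTANDALONE -DWITH_LIBC -o slabtest slaballoc.c

    gives you a small interactive tester ('a <bytes>' to allocate,
    'f <addr> <bytes>' to free -- see main() at the bottom).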

						-Matt

/*
 * LIBDUX/SLABALLOC.C	- DUX memory management
 *
 *	This module implements a slab allocator.  It uses sys_valloc(),
 *	sys_valloc_aligned(), and sys_vfree() to manage the underlying
 *	memory.
 *
 *	A slab allocator reserves a ZONE for each chunk size, then lays the
 *	chunks out in an array within the zone.  Allocation and deallocation
 *	are near instantaneous and fragmentation is minimized.  The downside
 *	is in VM usage (~80 zones x 128K = 10MB) and a moderate but
 *	reasonable amount of memory overhead in high volumes.  In low volumes
 *	memory overhead might be an issue, so a general purpose allocator
 *	might use something more compact like the LVZ or HVZ allocator for
 *	the first 128K (which can mix several chunk sizes in the same page),
 *	and then revert to this slab allocator for the remainder.
 *
 *	Alloc Size	Chunking        Number of zones
 *	0-127		8		16
 *	128-255		16		8
 *	256-511		32		8
 *	512-1023	64		8
 *	1024-2047	128		8
 *	2048-4095	256		8
 *	4096-8191	512		8
 *	8192-16383	1024		8
 *	16384-32767	2048		8
 *	(with ZONE_ALLOC_LIMIT set to 32768 the maximum zone allocation is
 *	32767; larger requests, and requests that are a multiple of
 *	PAGE_SIZE, are allocated directly with sys_valloc())
 */

#if defined(WITH_LIBC)

#include <sys/types.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <unistd.h>

#define arysize(ary)	(sizeof(ary)/sizeof((ary)[0]))

#if defined(STANDALONE)
#define kassert(cond)	do { if (!(cond)) _kassert(__FILE__, __LINE__, #cond); } while(0)
#ifndef DIAGNOSTIC
#define DIAGNOSTIC
#endif
#endif

#else

#include "defs.h"

Export void	slab_setup(void);
Export void	*slab_alloc(uintptr_t bytes);
Export void	slab_free(void *ptr, uintptr_t bytes);

#endif

#define MIN_CHUNK_SIZE		8		/* minimum chunk granularity */
#define MIN_CHUNK_MASK		(MIN_CHUNK_SIZE - 1)
#define ZONE_SIZE		(128 * 1024)	/* size of each aligned zone */
#define ZONE_ALLOC_LIMIT	32768		/* upper bound for zone-backed allocations */
#define ZONE_MAGIC		0x736c6162	/* "slab" */
#define ZONE_RELS_THRESH	2		/* completely-free zones kept cached */

#define IN_SAME_PAGE_MASK	(~(intptr_t)PAGE_MASK | MIN_CHUNK_MASK)

#if ZONE_ALLOC_LIMIT == 16384
#define NZONES			73	/* zoneindex() returns 1..72, slot 0 unused */
#elif ZONE_ALLOC_LIMIT == 32768
#define NZONES			81	/* zoneindex() returns 1..80, slot 0 unused */
#else
#error "I couldn't figure out NZONES"
#endif

typedef struct Chunk {
    struct Chunk *c_Next;
} Chunk;

typedef struct Zone {
    struct Zone *z_Next;	/* ZoneAry[] link if z_NFree non-zero */
    int		z_Magic;
    int		z_NFree;	/* total free chunks / ualloc space in zone */
    int		z_NMax;		/* maximum free chunks */
    int		z_UAlloc;	/* allocation offset into uninitialized space */
    int		z_ChunkSize;	/* chunk size for validation */
    int		z_FirstFreePg;	/* lowest page index that may have free chunks */
    int		z_ZoneIndex;	/* back-index into ZoneAry[] */
    Chunk	*z_PageAry[ZONE_SIZE / PAGE_SIZE];	/* per-page free chunk lists */
} Zone;

static Zone	*ZoneAry[NZONES];	/* linked list of zones NFree > 0 */
static Zone	*FreeZones;		/* whole zones that have become free */
static int	NFreeZones;		/* number of zones on the FreeZones list */

#if defined(WITH_LIBC)
static void *sys_valloc(uintptr_t bytes);
static void *sys_valloc_aligned(uintptr_t bytes);
static void sys_vfree(void *ptr, uintptr_t bytes);


#if defined(STANDALONE)
static void _kassert(const char *file, int line, const char *cond);
static void kpanic(const char *ctl, ...);
#endif
#endif

void
slab_setup(void)
{
}

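/*
 * zoneindex() - Return the ZoneAry[] index for a request and round *bytes
 *	up to the chunk size for that zone (see the chunking table in the
 *	comment at the top of this file).
 */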
static __inline int
zoneindex(uintptr_t *bytes)
{
    unsigned int n = (unsigned int)*bytes;	/* unsigned for shift opt */
    if (n < 128) {
	*bytes = n = (n + 7) & ~7;
	return(n / 8);
    }
    if (n < 256) {
	*bytes = n = (n + 15) & ~15;
	return(n / 16 + 8);
    }
    if (n < 8192) {
	if (n < 512) {
	    *bytes = n = (n + 31) & ~31;
	    return(n / 32 + 16);
	}
	if (n < 1024) {
	    *bytes = n = (n + 63) & ~63;
	    return(n / 64 + 24);
	} 
	if (n < 2048) {
	    *bytes = n = (n + 127) & ~127;
	    return(n / 128 + 32);
	}
	if (n < 4096) {
	    *bytes = n = (n + 255) & ~255;
	    return(n / 256 + 40);
	}
	*bytes = n = (n + 511) & ~511;
	return(n / 512 + 48);
    }
#if ZONE_ALLOC_LIMIT > 8192
    if (n < 16384) {
	*bytes = n = (n + 1023) & ~1023;
	return(n / 1024 + 56);
    }
#endif
#if ZONE_ALLOC_LIMIT > 16384
    if (n < 32768) {
	*bytes = n = (n + 2047) & ~2047;
	return(n / 2048 + 64);
    }
#endif
    kassert(0);		/* unexpected byte count */
    return(0);
}

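/*
 * slab_alloc() - Allocate the requested number of bytes.  Requests which
 *	are a multiple of PAGE_SIZE or which exceed ZONE_ALLOC_LIMIT are
 *	handed directly to sys_valloc(); everything else is carved out of
 *	a zone for the appropriate chunk size.
 */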
void *
slab_alloc(uintptr_t bytes)
{
    Zone *z;
    Chunk *chunk;
    int zi;

    /*
     * Degenerate cases
     */
    kassert(bytes > 0);
    if ((bytes & PAGE_MASK) == 0 || bytes > ZONE_ALLOC_LIMIT) {
	chunk = sys_valloc((bytes + PAGE_MASK) & ~PAGE_MASK);
	return(chunk);
    }

    /*
     * Zone case.  Attempt to allocate out of an existing zone.  First
     * try the free list, then allocate out of unallocated space.  If we
     * find a good zone move it to the head of the list (we might have
     * thousands of zones in the list).
     */
    zi = zoneindex(&bytes);
    if ((z = ZoneAry[zi]) != NULL) {
	kassert(z->z_NFree > 0);

	/*
	 * Remove us from the ZoneAry[] when we become empty
	 */
	if (--z->z_NFree == 0) {
	    ZoneAry[zi] = z->z_Next;
	    z->z_Next = NULL;
#if defined(STANDALONE)
	    printf("Zone %p fully allocated\n", z);
#endif
	}

	/*
	 * Locate a chunk in a free page.  This attempts to localize
	 * reallocations into earlier pages without us having to sort
	 * the chunk list.  A chunk may still overlap a page boundary.
	 */
	while (z->z_FirstFreePg < arysize(z->z_PageAry)) {
	    if ((chunk = z->z_PageAry[z->z_FirstFreePg]) != NULL) {
#ifdef DIAGNOSTIC
		/*
		 * Diagnostic: c_Next is not total garbage.
		 */
		kassert(chunk->c_Next == NULL ||
			((intptr_t)chunk->c_Next & IN_SAME_PAGE_MASK) ==
			((intptr_t)chunk & IN_SAME_PAGE_MASK));
#endif
		z->z_PageAry[z->z_FirstFreePg] = chunk->c_Next;
		return(chunk);
	    }
	    ++z->z_FirstFreePg;
	}
	chunk = (Chunk *)((char *)z + z->z_UAlloc);
	z->z_UAlloc += bytes;
	kassert(z->z_UAlloc <= ZONE_SIZE);
	return(chunk);
    }

    /*
     * If all zones are exhausted we need to allocate a new zone for this
     * size.
     */
    {
	int off;

	if ((z = FreeZones) != NULL) {
	    FreeZones = z->z_Next;
	    --NFreeZones;
	} else {
	    z = sys_valloc_aligned(ZONE_SIZE);
	    if (z == NULL)
		return(NULL);
	}
	bzero(z, sizeof(Zone));

	off = (sizeof(Zone) + MIN_CHUNK_MASK) & ~MIN_CHUNK_MASK;
	z->z_Magic = ZONE_MAGIC;
	z->z_ZoneIndex = zi;
	z->z_NMax = (ZONE_SIZE - off) / bytes;
	z->z_NFree = z->z_NMax - 1;
	z->z_UAlloc = off + bytes;
	z->z_ChunkSize = bytes;
	z->z_FirstFreePg = arysize(z->z_PageAry);
	chunk = (Chunk *)((char *)z + off);
	z->z_Next = ZoneAry[zi];
#if defined(STANDALONE)
	printf("Allocating new zone %p chunk %d %d/%d\n", 
	    z, z->z_ChunkSize, z->z_NFree, z->z_NMax);
#endif
	ZoneAry[zi] = z;
    }
    return(chunk);
}

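/*
 * slab_free() - Free a previously allocated block.  The caller must pass
 *	the same size that was passed to slab_alloc().  For zone-backed
 *	allocations the zone header is recovered by masking the pointer
 *	down to its ZONE_SIZE-aligned base.
 */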
void
slab_free(void *ptr, uintptr_t bytes)
{
    Zone *z;
    Chunk *chunk;
    int pgno;

    /*
     * Degenerate cases
     */
    kassert(bytes > 0);
    if ((bytes & PAGE_MASK) == 0 || bytes > ZONE_ALLOC_LIMIT) {
	sys_vfree(ptr, (bytes + PAGE_MASK) & ~PAGE_MASK);
	return;
    }

    /*
     * Zone case.  Figure out the zone based on the fact that it is
     * ZONE_SIZE aligned.
     */
    z = (Zone *)((intptr_t)ptr & ~(ZONE_SIZE - 1));

    kassert(z->z_Magic == ZONE_MAGIC);
#ifdef DIAGNOSTIC
    /*
     * Diagnostic: chunk size must match
     */
    (void)zoneindex(&bytes);
    kassert(z->z_ChunkSize == bytes);
#endif
    pgno = ((char *)ptr - (char *)z) >> PAGE_SHIFT;
    if (z->z_FirstFreePg > pgno)
	z->z_FirstFreePg = pgno;
    chunk = ptr;

#ifdef DIAGNOSTIC
    /*
     * Diagnostic: attempt to detect a double-free (not perfect).
     */
    if (((intptr_t)chunk->c_Next - (intptr_t)z) >> PAGE_SHIFT == pgno) {
	Chunk *scan;
	for (scan = z->z_PageAry[pgno]; scan; scan = scan->c_Next) {
	    if (scan == chunk)
		kpanic("Double free at %p", chunk);
	}
    }
#endif
    chunk->c_Next = z->z_PageAry[pgno];
    z->z_PageAry[pgno] = chunk;

    /*
     * Bump the number of free chunks.  If it becomes non-zero the zone
     * must be added back onto the appropriate list.
     */
    if (z->z_NFree++ == 0) {
	z->z_Next = ZoneAry[z->z_ZoneIndex];
	ZoneAry[z->z_ZoneIndex] = z;
#if defined(STANDALONE)
	printf("Zone %p no longer fully allocated (%p)\n", z, z->z_Next);
#endif
    }

    /*
     * If the zone becomes totally free, and there are other zones we
     * can allocate from, remove this one from the list.
     */
#if defined(STANDALONE)
    printf("NFree %d/%d\n", z->z_NFree, z->z_NMax);
#endif
    if (z->z_NFree == z->z_NMax && 
	(z->z_Next || ZoneAry[z->z_ZoneIndex] != z)
    ) {
	Zone **pz;

	for (pz = &ZoneAry[z->z_ZoneIndex]; z != *pz; pz = &(*pz)->z_Next)
	    ;
	*pz = z->z_Next;
	z->z_Magic = -1;
	if (NFreeZones == ZONE_RELS_THRESH) {
	    z->z_Next = FreeZones->z_Next;
#if defined(STANDALONE)
	    printf("Destroying idle zone %p\n", FreeZones);
#endif
	    sys_vfree(FreeZones, ZONE_SIZE);
	    FreeZones = z;
	} else {
	    z->z_Next = FreeZones;
	    FreeZones = z;
	    ++NFreeZones;
	}
#if defined(STANDALONE)
	printf("Returning completely free zone %p to freelist\n", z);
#endif
    }
}

#if defined(WITH_LIBC)

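/*
 * The sys_valloc(), sys_valloc_aligned(), and sys_vfree() backends below
 * emulate the underlying page allocator with anonymous mmap() so the slab
 * code can be exercised standalone in userland.
 */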
static void *
sys_valloc(uintptr_t bytes)
{
    char *ptr;

    ptr = mmap(NULL, bytes, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
    if (ptr == MAP_FAILED)
	ptr = NULL;
    return(ptr);
}

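/*
 * Allocate 'bytes' of VM aligned to 'bytes' (which must be a power of 2).
 * If the initial mapping is not sufficiently aligned, map twice the size
 * and trim the unaligned excess from the front and/or back.
 */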
static void *
sys_valloc_aligned(uintptr_t bytes)
{
    char *ptr;
    uintptr_t extra;

    ptr = mmap(NULL, bytes, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
    if (ptr == MAP_FAILED)
	return(NULL);
    if ((uintptr_t)ptr & (bytes - 1)) {
	munmap(ptr, bytes);
	ptr = mmap(NULL, bytes * 2, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
	if (ptr == MAP_FAILED)
	    return(NULL);
	if ((extra = (uintptr_t)ptr & (bytes - 1)) == 0) {
	    munmap(ptr + bytes, bytes);
	} else {
	    extra = bytes - extra;
	    munmap(ptr, extra);
	    ptr += extra;
	    munmap(ptr + bytes, bytes - extra);
	}
    }
    return(ptr);
}

static void
sys_vfree(void *ptr, uintptr_t bytes)
{
    munmap(ptr, bytes);
}

#if defined(STANDALONE)

int
main(int ac, char **av)
{
    char buf[256];
    Zone *z;

    slab_setup();
    printf("> "); 
    fflush(stdout);
    while (fgets(buf, sizeof(buf), stdin) != NULL) {
	intptr_t v1 = 0;
	intptr_t v2 = 0;
	char *ptr;

	if (buf[0]) {
	    ptr = buf + 1;
	    v1 = strtoul(ptr, &ptr, 0);
	    v2 = strtoul(ptr, &ptr, 0);
	}
	switch(buf[0]) {
	case 'a':
	    ptr = slab_alloc(v1);
	    if ((v1 & PAGE_MASK) != 0 && v1 <= ZONE_ALLOC_LIMIT) {
		z = (Zone *)((intptr_t)ptr & ~(ZONE_SIZE - 1));
		printf("allocate %p %ld\tZone %p %d/%d\n",
		    ptr, (long)v1, z, z->z_NFree, z->z_NMax);
	    } else {
		/* direct page allocation, no zone header to inspect */
		printf("allocate %p %ld\n", ptr, (long)v1);
	    }
	    break;
	case 'f':
	    slab_free((char *)v1, v2);
	    break;
	case '?':
	case 'h':
	    printf("a <bytes>\tallocate memory\n");
	    printf("f <addr> <bytes>\tfree memory\n");
	    printf("?/h\t\thelp\n");
	    break;
	default:
	    printf("? bad command\n");
	    break;
	}
	printf("> "); 
	fflush(stdout);
    }
    return(0);
}

static void
_kassert(const char *file, int line, const char *cond)
{
    fprintf(stderr, "%s line %d: %s\n", file, line, cond);
    *(int *)0 = 1;
}

static void
kpanic(const char *ctl, ...)
{
    va_list va;

    va_start(va, ctl);
    vfprintf(stderr, ctl, va);
    va_end(va);
    fprintf(stderr, "\n");
    *(int *)0 = 1;
}

#endif

#endif




