From owner-freebsd-bugs@FreeBSD.ORG Wed Jun 23 15:26:40 2004
Date: Wed, 23 Jun 2004 11:26:38 -0400
From: Brian Fundakowski Feldman
To: freebsd-bugs@FreeBSD.org
Subject: Re: kern/55081: contigmalloc API semantics inadequate --- forces KVM mapping
Message-ID: <20040623152638.GC1057@green.homeunix.org>
In-Reply-To: <200406230915.i5N9FJLF004193@freefall.freebsd.org>
References: <200406230915.i5N9FJLF004193@freefall.freebsd.org>
User-Agent: Mutt/1.5.6i

Indeed, I have a reimplementation of contigmalloc(9) ready and waiting.
As you suggested, it is split apart into functions that can be called
without mapping the memory into kernel space.  However, it is also a
reimplementation of the actual acquisition of pages, so it should be far
more reliable than before: instead of paging out everything and hoping to
get a contiguous region, it pages out one page at a time, and it should
only fail from running completely out of swap space or from having so
much wired memory that it fragments the part of the physical address
space you want to use.

Please try this patch and see how well it works for you.  You do not have
to change vm.old_contigmalloc to use the new vm_page_alloc_contig() and
vm_page_release_contig() functions.  You should also see the allocated
memory show up in vmstat -m output for calls to contigmalloc(); you can
report your driver's memory to these pools, too, using the
malloc_type_*() functions.
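
To make the driver-facing side concrete, here is a minimal usage sketch;
it is not part of the patch, and the malloc type, structure, function
names, and buffer size (M_MYDRV, mydrv_buf, mydrv_buf_alloc/free, 64 KB)
are invented for illustration.  It grabs a physically contiguous,
page-aligned buffer below 4 GB without ever mapping it into kernel VM,
and charges it to a private pool so it is visible in vmstat -m:

/*
 * Hypothetical driver-side sketch (not part of the patch).  All of the
 * mydrv_* and M_MYDRV names are made up for this example.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/errno.h>

#include <vm/vm.h>
#include <vm/vm_page.h>

MALLOC_DEFINE(M_MYDRV, "mydrv", "mydrv contiguous DMA buffers");

#define	MYDRV_BUFSIZE	(64 * 1024)

struct mydrv_buf {
	vm_page_t	pages;		/* first page of the contiguous run */
	vm_pindex_t	npages;
	vm_paddr_t	paddr;		/* physical address for the device */
};

static int
mydrv_buf_alloc(struct mydrv_buf *b)
{

	b->npages = MYDRV_BUFSIZE >> PAGE_SHIFT;
	/*
	 * low = 0, high = 4 GB, PAGE_SIZE alignment, no boundary
	 * restriction; contigmalloc() in the patch makes the same kind of
	 * call with Giant held, so do likewise here.
	 */
	mtx_lock(&Giant);
	b->pages = vm_page_alloc_contig(b->npages, 0, 0xffffffffUL,
	    PAGE_SIZE, 0);
	mtx_unlock(&Giant);
	if (b->pages == NULL)
		return (ENOMEM);
	b->paddr = VM_PAGE_TO_PHYS(b->pages);
	/* Report the memory so it shows up in vmstat -m. */
	malloc_type_allocated(M_MYDRV, b->npages << PAGE_SHIFT);
	return (0);
}

static void
mydrv_buf_free(struct mydrv_buf *b)
{

	mtx_lock(&Giant);
	vm_page_release_contig(b->pages, b->npages);
	mtx_unlock(&Giant);
	malloc_type_freed(M_MYDRV, b->npages << PAGE_SHIFT);
}

Since the pages are never entered into a kernel object or mapped, a
driver that only needs the physical address no longer consumes kernel
virtual address space, which is the complaint kern/55081 raises about the
existing contigmalloc() interface.
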
Index: sys/malloc.h
===================================================================
RCS file: /usr/ncvs/src/sys/sys/malloc.h,v
retrieving revision 1.76
diff -u -r1.76 malloc.h
--- sys/malloc.h	7 Apr 2004 04:19:49 -0000	1.76
+++ sys/malloc.h	19 Jun 2004 05:20:30 -0000
@@ -105,6 +105,8 @@
 void	*malloc(unsigned long size, struct malloc_type *type, int flags);
 void	malloc_init(void *);
 int	malloc_last_fail(void);
+void	malloc_type_allocated(struct malloc_type *type, unsigned long size);
+void	malloc_type_freed(struct malloc_type *type, unsigned long size);
 void	malloc_uninit(void *);
 void	*realloc(void *addr, unsigned long size, struct malloc_type *type,
 	    int flags);
Index: kern/kern_malloc.c
===================================================================
RCS file: /usr/ncvs/src/sys/kern/kern_malloc.c,v
retrieving revision 1.133
diff -u -r1.133 kern_malloc.c
--- kern/kern_malloc.c	31 May 2004 21:46:04 -0000	1.133
+++ kern/kern_malloc.c	19 Jun 2004 05:22:40 -0000
@@ -175,6 +175,47 @@
 }
 
 /*
+ * Add this to the informational malloc_type bucket.
+ */
+static void
+malloc_type_zone_allocated(struct malloc_type *ksp, unsigned long size,
+    int zindx)
+{
+	mtx_lock(&ksp->ks_mtx);
+	ksp->ks_calls++;
+	if (zindx != -1)
+		ksp->ks_size |= 1 << zindx;
+	if (size != 0) {
+		ksp->ks_memuse += size;
+		ksp->ks_inuse++;
+		if (ksp->ks_memuse > ksp->ks_maxused)
+			ksp->ks_maxused = ksp->ks_memuse;
+	}
+	mtx_unlock(&ksp->ks_mtx);
+}
+
+void
+malloc_type_allocated(struct malloc_type *ksp, unsigned long size)
+{
+	malloc_type_zone_allocated(ksp, size, -1);
+}
+
+/*
+ * Remove this allocation from the informational malloc_type bucket.
+ */
+void
+malloc_type_freed(struct malloc_type *ksp, unsigned long size)
+{
+	mtx_lock(&ksp->ks_mtx);
+	KASSERT(size <= ksp->ks_memuse,
+	    ("malloc(9)/free(9) confusion.\n%s",
+	    "Probably freeing with wrong type, but maybe not here."));
+	ksp->ks_memuse -= size;
+	ksp->ks_inuse--;
+	mtx_unlock(&ksp->ks_mtx);
+}
+
+/*
  * malloc:
  *
  *	Allocate a block of memory.
@@ -195,7 +236,6 @@
 #ifdef DIAGNOSTIC
 	unsigned long osize = size;
 #endif
-	register struct malloc_type *ksp = type;
 
 #ifdef INVARIANTS
 	/*
@@ -241,29 +281,16 @@
 		krequests[size >> KMEM_ZSHIFT]++;
 #endif
 		va = uma_zalloc(zone, flags);
-		mtx_lock(&ksp->ks_mtx);
-		if (va == NULL)
-			goto out;
-
-		ksp->ks_size |= 1 << indx;
-		size = keg->uk_size;
+		if (va != NULL)
+			size = keg->uk_size;
+		malloc_type_zone_allocated(type, va == NULL ? 0 : size, indx);
 	} else {
 		size = roundup(size, PAGE_SIZE);
 		zone = NULL;
 		keg = NULL;
 		va = uma_large_malloc(size, flags);
-		mtx_lock(&ksp->ks_mtx);
-		if (va == NULL)
-			goto out;
+		malloc_type_allocated(type, va == NULL ? 0 : size);
 	}
-	ksp->ks_memuse += size;
-	ksp->ks_inuse++;
-out:
-	ksp->ks_calls++;
-	if (ksp->ks_memuse > ksp->ks_maxused)
-		ksp->ks_maxused = ksp->ks_memuse;
-
-	mtx_unlock(&ksp->ks_mtx);
 	if (flags & M_WAITOK)
 		KASSERT(va != NULL, ("malloc(M_WAITOK) returned NULL"));
 	else if (va == NULL)
@@ -288,7 +315,6 @@
 	void *addr;
 	struct malloc_type *type;
 {
-	register struct malloc_type *ksp = type;
 	uma_slab_t slab;
 	u_long size;
 
@@ -296,7 +322,7 @@
 	if (addr == NULL)
 		return;
 
-	KASSERT(ksp->ks_memuse > 0,
+	KASSERT(type->ks_memuse > 0,
 	    ("malloc(9)/free(9) confusion.\n%s",
 	    "Probably freeing with wrong type, but maybe not here."));
 	size = 0;
@@ -333,13 +359,7 @@
 		size = slab->us_size;
 		uma_large_free(slab);
 	}
-	mtx_lock(&ksp->ks_mtx);
-	KASSERT(size <= ksp->ks_memuse,
-	    ("malloc(9)/free(9) confusion.\n%s",
-	    "Probably freeing with wrong type, but maybe not here."));
-	ksp->ks_memuse -= size;
-	ksp->ks_inuse--;
-	mtx_unlock(&ksp->ks_mtx);
+	malloc_type_freed(type, size);
 }
 
 /*
Index: vm/vm_page.h
===================================================================
RCS file: /usr/ncvs/src/sys/vm/vm_page.h,v
retrieving revision 1.131
diff -u -r1.131 vm_page.h
--- vm/vm_page.h	5 Jun 2004 21:06:42 -0000	1.131
+++ vm/vm_page.h	19 Jun 2004 13:55:08 -0000
@@ -342,6 +342,9 @@
 
 void vm_page_activate (vm_page_t);
 vm_page_t vm_page_alloc (vm_object_t, vm_pindex_t, int);
+vm_page_t vm_page_alloc_contig (vm_pindex_t, vm_paddr_t, vm_paddr_t,
+	    vm_offset_t, vm_offset_t);
+void vm_page_release_contig (vm_page_t, vm_pindex_t);
 vm_page_t vm_page_grab (vm_object_t, vm_pindex_t, int);
 void vm_page_cache (register vm_page_t);
 int vm_page_try_to_cache (vm_page_t);
Index: vm/vm_contig.c
===================================================================
RCS file: /usr/ncvs/src/sys/vm/vm_contig.c,v
retrieving revision 1.35
diff -u -r1.35 vm_contig.c
--- vm/vm_contig.c	15 Jun 2004 01:02:00 -0000	1.35
+++ vm/vm_contig.c	19 Jun 2004 13:56:18 -0000
@@ -68,6 +68,9 @@
 #include
 #include
 #include
+#include
+#include
+#include
 #include
 #include
@@ -83,49 +86,63 @@
 #include
 
 static int
-vm_contig_launder(int queue)
+vm_contig_launder_page(vm_page_t m)
 {
 	vm_object_t object;
-	vm_page_t m, m_tmp, next;
+	vm_page_t m_tmp;
 	struct vnode *vp;
 
+	if (!VM_OBJECT_TRYLOCK(m->object))
+		return (EAGAIN);
+	if (vm_page_sleep_if_busy(m, TRUE, "vpctw0")) {
+		VM_OBJECT_UNLOCK(m->object);
+		vm_page_lock_queues();
+		return (EBUSY);
+	}
+	vm_page_test_dirty(m);
+	if (m->dirty == 0 && m->busy == 0 && m->hold_count == 0)
+		pmap_remove_all(m);
+	if (m->dirty) {
+		object = m->object;
+		if (object->type == OBJT_VNODE) {
+			vm_page_unlock_queues();
+			vp = object->handle;
+			VM_OBJECT_UNLOCK(object);
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
+			VM_OBJECT_LOCK(object);
+			vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
+			VM_OBJECT_UNLOCK(object);
+			VOP_UNLOCK(vp, 0, curthread);
+			vm_page_lock_queues();
+			return (0);
+		} else if (object->type == OBJT_SWAP ||
+		    object->type == OBJT_DEFAULT) {
+			m_tmp = m;
+			vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC);
+			VM_OBJECT_UNLOCK(object);
+			return (0);
+		}
+	} else if (m->busy == 0 && m->hold_count == 0)
+		vm_page_cache(m);
+	VM_OBJECT_UNLOCK(m->object);
+	return (0);
+}
+
+static int
+vm_contig_launder(int queue)
+{
+	vm_page_t m, next;
+	int error;
+
 	for (m = TAILQ_FIRST(&vm_page_queues[queue].pl); m != NULL; m = next) {
 		next = TAILQ_NEXT(m, pageq);
 		KASSERT(m->queue == queue,
 		    ("vm_contig_launder: page %p's queue is not %d", m, queue));
-		if (!VM_OBJECT_TRYLOCK(m->object))
-			continue;
-		if (vm_page_sleep_if_busy(m, TRUE, "vpctw0")) {
-			VM_OBJECT_UNLOCK(m->object);
-			vm_page_lock_queues();
+		error = vm_contig_launder_page(m);
+		if (error == 0)
 			return (TRUE);
-		}
-		vm_page_test_dirty(m);
-		if (m->dirty == 0 && m->busy == 0 && m->hold_count == 0)
-			pmap_remove_all(m);
-		if (m->dirty) {
-			object = m->object;
-			if (object->type == OBJT_VNODE) {
-				vm_page_unlock_queues();
-				vp = object->handle;
-				VM_OBJECT_UNLOCK(object);
-				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, curthread);
-				VM_OBJECT_LOCK(object);
-				vm_object_page_clean(object, 0, 0, OBJPC_SYNC);
-				VM_OBJECT_UNLOCK(object);
-				VOP_UNLOCK(vp, 0, curthread);
-				vm_page_lock_queues();
-				return (TRUE);
-			} else if (object->type == OBJT_SWAP ||
-			    object->type == OBJT_DEFAULT) {
-				m_tmp = m;
-				vm_pageout_flush(&m_tmp, 1, VM_PAGER_PUT_SYNC);
-				VM_OBJECT_UNLOCK(object);
-				return (TRUE);
-			}
-		} else if (m->busy == 0 && m->hold_count == 0)
-			vm_page_cache(m);
-		VM_OBJECT_UNLOCK(m->object);
+		if (error == EBUSY)
+			return (FALSE);
 	}
 	return (FALSE);
 }
@@ -308,6 +325,193 @@
 	return (NULL);
 }
 
+static void
+vm_page_release_contigl(vm_page_t m, vm_pindex_t count)
+{
+	mtx_lock_spin(&vm_page_queue_free_mtx);
+	while (count--) {
+		vm_pageq_enqueue(PQ_FREE + m->pc, m);
+		m++;
+	}
+	mtx_unlock_spin(&vm_page_queue_free_mtx);
+}
+
+void
+vm_page_release_contig(vm_page_t m, vm_pindex_t count)
+{
+	vm_page_lock_queues();
+	vm_page_release_contigl(m, count);
+	vm_page_unlock_queues();
+}
+
+static void
+vm_contig_unqueue_free(vm_page_t m)
+{
+
+	KASSERT((m->queue - m->pc) == PQ_FREE,
+	    ("contigmalloc2: page %p not freed", m));
+	mtx_lock_spin(&vm_page_queue_free_mtx);
+	vm_pageq_remove_nowakeup(m);
+	mtx_unlock_spin(&vm_page_queue_free_mtx);
+	m->valid = VM_PAGE_BITS_ALL;
+	if (m->flags & PG_ZERO)
+		vm_page_zero_count--;
+	/* Don't clear the PG_ZERO flag; we'll need it later. */
+	m->flags = PG_UNMANAGED | (m->flags & PG_ZERO);
+	KASSERT(m->dirty == 0,
+	    ("contigmalloc2: page %p was dirty", m));
+	m->wire_count = 0;
+	m->busy = 0;
+	m->object = NULL;
+}
+
+vm_page_t
+vm_page_alloc_contig(vm_pindex_t npages, vm_paddr_t low, vm_paddr_t high,
+	    vm_offset_t alignment, vm_offset_t boundary)
+{
+	vm_object_t object;
+	vm_offset_t size;
+	vm_paddr_t phys;
+	vm_page_t pga = vm_page_array;
+	int i, pass, pqtype, start;
+
+	size = npages << PAGE_SHIFT;
+	if (size == 0)
+		panic("vm_page_alloc_contig: size must not be 0");
+	if ((alignment & (alignment - 1)) != 0)
+		panic("vm_page_alloc_contig: alignment must be a power of 2");
+	if ((boundary & (boundary - 1)) != 0)
+		panic("vm_page_alloc_contig: boundary must be a power of 2");
+
+	for (pass = 0; pass < 2; pass++) {
+		start = cnt.v_page_count;
+		vm_page_lock_queues();
+retry:
+		start--;
+		/*
+		 * Find last page in array that is free, within range,
+		 * aligned, and such that the boundary won't be crossed.
+		 */
+		for (i = start; i >= 0; i--) {
+			phys = VM_PAGE_TO_PHYS(&pga[i]);
+			pqtype = pga[i].queue - pga[i].pc;
+			if (pass == 0) {
+				if (pqtype != PQ_FREE && pqtype != PQ_CACHE)
+					continue;
+			} else if (pqtype != PQ_FREE && pqtype != PQ_CACHE &&
+			    pga[i].queue != PQ_ACTIVE &&
+			    pga[i].queue != PQ_INACTIVE)
+				continue;
+			if (phys >= low && phys + size <= high &&
+			    ((phys & (alignment - 1)) == 0) &&
+			    ((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0)
+				break;
+		}
+		/* There are no candidates at all. */
+		if (i == -1) {
+			vm_page_unlock_queues();
+			continue;
+		}
+		start = i;
+		/*
+		 * Check successive pages for contiguous and free.
+		 */
+		for (i = start + 1; i < start + npages; i++) {
+			pqtype = pga[i].queue - pga[i].pc;
+			if (VM_PAGE_TO_PHYS(&pga[i]) !=
+			    VM_PAGE_TO_PHYS(&pga[i - 1]) + PAGE_SIZE)
+				goto retry;
+			if (pass == 0) {
+				if (pqtype != PQ_FREE && pqtype != PQ_CACHE)
+					goto retry;
+			} else if (pqtype != PQ_FREE && pqtype != PQ_CACHE &&
+			    pga[i].queue != PQ_ACTIVE &&
+			    pga[i].queue != PQ_INACTIVE)
+				goto retry;
+		}
+		for (i = start; i < start + npages; i++) {
+			vm_page_t m = &pga[i];
+
+			pqtype = m->queue - m->pc;
+			if (pass != 0 && pqtype != PQ_FREE &&
+			    pqtype != PQ_CACHE) {
+				switch (m->queue) {
+				case PQ_ACTIVE:
+				case PQ_INACTIVE:
+					if (vm_contig_launder_page(m) != 0)
+						goto cleanup_freed;
+					pqtype = m->queue - m->pc;
+					if (pqtype == PQ_FREE ||
+					    pqtype == PQ_CACHE)
+						break;
+				default:
+cleanup_freed:
+					vm_page_release_contigl(&pga[start],
+					    i - start);
+					goto retry;
+				}
+			}
+			if (pqtype == PQ_CACHE) {
+				object = m->object;
+				if (!VM_OBJECT_TRYLOCK(object))
+					goto retry;
+				vm_page_busy(m);
+				vm_page_free(m);
+				VM_OBJECT_UNLOCK(object);
+			}
+			vm_contig_unqueue_free(m);
+		}
+		vm_page_unlock_queues();
+		/*
+		 * We've found a contiguous chunk that meets our requirements.
+		 */
+		return (&pga[start]);
+	}
+	return (NULL);
+}
+
+static void *
+contigmalloc2(vm_page_t m, vm_pindex_t npages, int flags)
+{
+	vm_object_t object = kernel_object;
+	vm_map_t map = kernel_map;
+	vm_offset_t addr, tmp_addr;
+	vm_pindex_t i;
+
+	/*
+	 * Allocate kernel VM, unfree and assign the physical pages to
+	 * it and return kernel VM pointer.
+	 */
+	vm_map_lock(map);
+	if (vm_map_findspace(map, vm_map_min(map), npages << PAGE_SHIFT, &addr)
+	    != KERN_SUCCESS) {
+		vm_map_unlock(map);
+		return (NULL);
+	}
+	vm_object_reference(object);
+	vm_map_insert(map, object, addr - VM_MIN_KERNEL_ADDRESS,
+	    addr, addr + (npages << PAGE_SHIFT), VM_PROT_ALL, VM_PROT_ALL, 0);
+	vm_map_unlock(map);
+	tmp_addr = addr;
+	VM_OBJECT_LOCK(object);
+	for (i = 0; i < npages; i++) {
+		vm_page_insert(&m[i], object,
+		    OFF_TO_IDX(tmp_addr - VM_MIN_KERNEL_ADDRESS));
+		if ((flags & M_ZERO) && !(m->flags & PG_ZERO))
+			pmap_zero_page(&m[i]);
+		tmp_addr += PAGE_SIZE;
+	}
+	VM_OBJECT_UNLOCK(object);
+	vm_map_wire(map, addr, addr + (npages << PAGE_SHIFT),
+	    VM_MAP_WIRE_SYSTEM | VM_MAP_WIRE_NOHOLES);
+	return ((void *)addr);
+}
+
+static int vm_old_contigmalloc = 1;
+SYSCTL_INT(_vm, OID_AUTO, old_contigmalloc,
+    CTLFLAG_RW, &vm_old_contigmalloc, 0, "Use the old contigmalloc algorithm");
+TUNABLE_INT("vm.old_contigmalloc", &vm_old_contigmalloc);
+
 void *
 contigmalloc(
 	unsigned long size,	/* should be size_t here and for malloc() */
@@ -319,17 +523,37 @@
 	unsigned long boundary)
 {
 	void * ret;
+	vm_page_t pages;
+	vm_pindex_t npgs;
 
+	npgs = round_page(size) >> PAGE_SHIFT;
 	mtx_lock(&Giant);
-	ret = contigmalloc1(size, type, flags, low, high, alignment, boundary,
-	    kernel_map);
+	if (vm_old_contigmalloc) {
+		ret = contigmalloc1(size, type, flags, low, high, alignment,
+		    boundary, kernel_map);
+	} else {
+		pages = vm_page_alloc_contig(npgs, low, high,
+		    alignment, boundary);
+		if (pages == NULL) {
+			ret = NULL;
+		} else {
+			ret = contigmalloc2(pages, npgs, flags);
+			if (ret == NULL)
+				vm_page_release_contig(pages, npgs);
+		}
+
+	}
 	mtx_unlock(&Giant);
+	malloc_type_allocated(type, ret == NULL ? 0 : npgs << PAGE_SHIFT);
 	return (ret);
 }
 
 void
 contigfree(void *addr, unsigned long size, struct malloc_type *type)
 {
+	vm_pindex_t npgs;
 
+	npgs = round_page(size) >> PAGE_SHIFT;
 	kmem_free(kernel_map, (vm_offset_t)addr, size);
+	malloc_type_freed(type, npgs << PAGE_SHIFT);
 }

-- 
Brian Fundakowski Feldman                           \'[ FreeBSD ]''''''''''\
  <> green@FreeBSD.org                               \  The Power to Serve! \
 Opinions expressed are my own.                       \,,,,,,,,,,,,,,,,,,,,,,\