Date:      Tue, 04 Sep 2012 10:58:17 -0600
From:      Ian Lepore <freebsd@damnhippie.dyndns.org>
To:        freebsd-arm@freebsd.org
Cc:        freebsd-mips@freebsd.org
Subject:   busdma buffer management enhancements - call for review and test
Message-ID:  <1346777897.1140.633.camel@revolution.hippie.lan>


--=-nhYas3YqMWIkbv0iBerz
Content-Type: text/plain; charset="us-ascii"
Content-Transfer-Encoding: 7bit

The attached set of patches enhances the ARM v4/v5 busdma management of
small DMA buffers, especially for BUS_DMA_COHERENT and uncacheable
memory.  The existing implementation uses malloc(9) to allocate buffers,
then uses arm_remap_nocache() to make the page(s) containing the buffers
uncacheable, even when those pages contain data other than DMA buffers.

The attached patches address this by:

      * Adding support for pmap_page_set_memattr() and
        VM_MEMATTR_UNCACHEABLE to the ARM v4 pmap implementation.
      * Adding new common code usable by any platform that uses uma(9)
        to manage pools of power-of-two sized buffers, keeping them
        sized and aligned as required to help maintain cache coherency.
      * Modifying the busdma implementation to use the new pool manager
        to manage pools of both regular and uncacheable buffers, and
        also to use uma(9) to manage a pool of map structures (a short
        usage sketch follows this list).
      * Using knowledge of the alignment and padding of pool-allocated
        buffers to avoid doing partial cache line flushes on those
        buffers.
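
To make the intended use concrete, here is (roughly) how the third patch
wires the new pool manager into the ARM busdma init code; this is just a
condensed sketch of the real diff, with error handling elided:

  static busdma_bufalloc_t standard_allocator;   /* cacheable buffers */
  static busdma_bufalloc_t coherent_allocator;   /* uncacheable buffers */

  static void
  busdma_init(void *dummy)
  {
          /* Cache of ordinary buffers, cache-line sized and aligned. */
          standard_allocator = busdma_bufalloc_create("buffer",
              arm_dcache_align, NULL, NULL, 0);

          /* Cache of uncacheable buffers, for BUS_DMA_COHERENT. */
          coherent_allocator = busdma_bufalloc_create("coherent",
              arm_dcache_align,
              busdma_bufalloc_alloc_uncacheable,
              busdma_bufalloc_free_uncacheable, 0);
  }
  SYSINIT(busdma, SI_SUB_KMEM, SI_ORDER_THIRD, busdma_init, NULL);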

I'm CC'ing freebsd-mips because I think these patches can be adapted to
the MIPS busdma implementation as well.  The new busdma_bufalloc.[ch]
code isn't arm-specific, and shouldn't live in sys/arm/arm/, but I don't
know where it should go.

Once we have ARM and MIPS able to efficiently manage small cache-aligned
DMA buffers, the stage is set to begin updating device drivers to
allocate buffers individually, rather than allocating huge chunks and
sub-dividing them, possibly violating cache line boundaries in doing so.
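
As a hypothetical example (the softc and field names below are invented for
illustration, not taken from any real driver), a driver would then allocate
each small coherent buffer on its own rather than carving up one big
allocation by hand:

  for (i = 0; i < NCMDS; i++) {
          error = bus_dmamem_alloc(sc->cmd_tag, (void **)&sc->cmd[i],
              BUS_DMA_COHERENT | BUS_DMA_ZERO, &sc->cmd_map[i]);
          if (error != 0)
                  return (error);
          /* Each buffer is cache-line aligned and padded by the pool,
             so no unrelated data shares its cache lines. */
  }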

I've been running these for a couple days on DreamPlug and Atmel
hardware (the latter in an 8.x environment) without problems.

-- Ian

--=-nhYas3YqMWIkbv0iBerz
Content-Disposition: attachment; filename="arm_pmap_page_set_memattr.diff"
Content-Type: text/x-patch; name="arm_pmap_page_set_memattr.diff";
	charset="us-ascii"
Content-Transfer-Encoding: 7bit

# HG changeset patch
# User Ian Lepore <freebsd@damnhippie.dyndns.org>
# Date 1346609105 21600
# Node ID 0d90c8ad4bb328c5b9c6e9a515f571b1a71e9568
# Parent  0599cbc079c9fbd7ed53b525fafff622b939a318
Add ARM v4/v5 support for pmap_page_set_memattr() and one new attribute, 
VM_MEMATTR_UNCACHEABLE.  This helps pave the way for changes in the ARM 
busdma implementation, including the ability to use kmem_alloc_attr() to 
allocate large regions of uncached memory which aren't necessarily 
contiguous, and to use uma(9) to efficiently sub-allocate uncached pages 
for small IO buffers.  

Right now, UNCACHEABLE disables both cache and write buffering on the page.
In the future we may want more fine-grained control with separate defines
for cache and buffering, and also the armv6+ world may have additional
attributes (such as SHARED).
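
To illustrate what this enables (not part of the patch itself; it just
mirrors the allocator added in the next attachment), uncached pages that
need not be physically contiguous can now be obtained directly from the VM
system:

  /* Allocate 'size' bytes of kernel memory whose pages are mapped
     uncacheable; physical contiguity is not required. */
  buf = (void *)kmem_alloc_attr(kernel_map, size, M_WAITOK, 0,
      BUS_SPACE_MAXADDR, VM_MEMATTR_UNCACHEABLE);
  ...
  kmem_free(kernel_map, (vm_offset_t)buf, size);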

diff -r 0599cbc079c9 -r 0d90c8ad4bb3 sys/arm/arm/pmap.c
--- sys/arm/arm/pmap.c	Sun Sep 02 11:44:13 2012 -0600
+++ sys/arm/arm/pmap.c	Sun Sep 02 12:05:05 2012 -0600
@@ -1392,7 +1392,8 @@ pmap_fix_cache(struct vm_page *pg, pmap_
 		    (pv->pv_flags & PVF_NC)) {
 
 			pv->pv_flags &= ~PVF_NC;
-			pmap_set_cache_entry(pv, pm, va, 1);
+			if (!(pg->md.pv_memattr & VM_MEMATTR_UNCACHEABLE))
+				pmap_set_cache_entry(pv, pm, va, 1);
 			continue;
 		}
 			/* user is no longer sharable and writable */
@@ -1401,7 +1402,8 @@ pmap_fix_cache(struct vm_page *pg, pmap_
 		    !pmwc && (pv->pv_flags & PVF_NC)) {
 
 			pv->pv_flags &= ~(PVF_NC | PVF_MWC);
-			pmap_set_cache_entry(pv, pm, va, 1);
+			if (!(pg->md.pv_memattr & VM_MEMATTR_UNCACHEABLE))
+				pmap_set_cache_entry(pv, pm, va, 1);
 		}
 	}
 
@@ -1452,15 +1454,16 @@ pmap_clearbit(struct vm_page *pg, u_int 
 
 		if (!(oflags & maskbits)) {
 			if ((maskbits & PVF_WRITE) && (pv->pv_flags & PVF_NC)) {
-				/* It is safe to re-enable cacheing here. */
-				PMAP_LOCK(pm);
-				l2b = pmap_get_l2_bucket(pm, va);
-				ptep = &l2b->l2b_kva[l2pte_index(va)];
-				*ptep |= pte_l2_s_cache_mode;
-				PTE_SYNC(ptep);
-				PMAP_UNLOCK(pm);
+				if (!(pg->md.pv_memattr & 
+				    VM_MEMATTR_UNCACHEABLE)) {
+					PMAP_LOCK(pm);
+					l2b = pmap_get_l2_bucket(pm, va);
+					ptep = &l2b->l2b_kva[l2pte_index(va)];
+					*ptep |= pte_l2_s_cache_mode;
+					PTE_SYNC(ptep);
+					PMAP_UNLOCK(pm);
+				}
 				pv->pv_flags &= ~(PVF_NC | PVF_MWC);
-				
 			}
 			continue;
 		}
@@ -1489,7 +1492,9 @@ pmap_clearbit(struct vm_page *pg, u_int 
 				 * permission.
 				 */
 				if (maskbits & PVF_WRITE) {
-					npte |= pte_l2_s_cache_mode;
+					if (!(pg->md.pv_memattr & 
+					    VM_MEMATTR_UNCACHEABLE))
+						npte |= pte_l2_s_cache_mode;
 					pv->pv_flags &= ~(PVF_NC | PVF_MWC);
 				}
 			} else
@@ -1830,6 +1835,7 @@ pmap_page_init(vm_page_t m)
 {
 
 	TAILQ_INIT(&m->md.pv_list);
+	m->md.pv_memattr = VM_MEMATTR_DEFAULT;
 }
 
 /*
@@ -3427,7 +3433,8 @@ do_l2b_alloc:
 		    (m->oflags & VPO_UNMANAGED) == 0)
 			vm_page_aflag_set(m, PGA_WRITEABLE);
 	}
-	npte |= pte_l2_s_cache_mode;
+	if (!(m->md.pv_memattr & VM_MEMATTR_UNCACHEABLE))
+		npte |= pte_l2_s_cache_mode;
 	if (m && m == opg) {
 		/*
 		 * We're changing the attrs of an existing mapping.
@@ -4963,3 +4970,24 @@ pmap_devmap_find_va(vm_offset_t va, vm_s
 	return (NULL);
 }
 
+void
+pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
+{
+	/* 
+	 * Remember the memattr in a field that gets used to set the appropriate
+	 * bits in the PTEs as mappings are established.
+	 */
+	m->md.pv_memattr = ma;
+
+	/*
+	 * It appears that this function can only be called before any mappings
+	 * for the page are established on ARM.  If this ever changes, this code
+	 * will need to walk the pv_list and make each of the existing mappings
+	 * uncacheable, being careful to sync caches and PTEs (and maybe
+	 * invalidate TLB?) for any current mapping it modifies.
+	 */
+	if (m->md.pv_kva != 0 || TAILQ_FIRST(&m->md.pv_list) != NULL)
+		panic("Can't change memattr on page with existing mappings");
+}
+
+
diff -r 0599cbc079c9 -r 0d90c8ad4bb3 sys/arm/include/pmap.h
--- sys/arm/include/pmap.h	Sun Sep 02 11:44:13 2012 -0600
+++ sys/arm/include/pmap.h	Sun Sep 02 12:05:05 2012 -0600
@@ -97,10 +97,10 @@ enum mem_type {
 
 #endif
 
-#define	pmap_page_get_memattr(m)	VM_MEMATTR_DEFAULT
+#define	pmap_page_get_memattr(m)	((m)->md.pv_memattr)
 #define	pmap_page_is_mapped(m)	(!TAILQ_EMPTY(&(m)->md.pv_list))
 #define	pmap_page_is_write_mapped(m)	(((m)->aflags & PGA_WRITEABLE) != 0)
-#define	pmap_page_set_memattr(m, ma)	(void)0
+void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma);
 
 /*
  * Pmap stuff
@@ -120,6 +120,7 @@ struct	pv_entry;
 
 struct	md_page {
 	int pvh_attrs;
+	vm_memattr_t	 pv_memattr;
 	vm_offset_t pv_kva;		/* first kernel VA mapping */
 	TAILQ_HEAD(,pv_entry)	pv_list;
 };
diff -r 0599cbc079c9 -r 0d90c8ad4bb3 sys/arm/include/vm.h
--- sys/arm/include/vm.h	Sun Sep 02 11:44:13 2012 -0600
+++ sys/arm/include/vm.h	Sun Sep 02 12:05:05 2012 -0600
@@ -29,7 +29,8 @@
 #ifndef _MACHINE_VM_H_
 #define	_MACHINE_VM_H_
 
-/* Memory attribute configuration is not (yet) implemented. */
+/* Memory attribute configuration. */
 #define	VM_MEMATTR_DEFAULT	0
+#define	VM_MEMATTR_UNCACHEABLE	1
 
 #endif /* !_MACHINE_VM_H_ */

--=-nhYas3YqMWIkbv0iBerz
Content-Disposition: attachment; filename="arm_busdma_bufalloc.diff"
Content-Type: text/x-patch; name="arm_busdma_bufalloc.diff"; charset="us-ascii"
Content-Transfer-Encoding: 7bit

# HG changeset patch
# User Ian Lepore <freebsd@damnhippie.dyndns.org>
# Date 1346768587 21600
# Node ID a84eaa62a91f0993e70448863192b7a057c227b8
# Parent  c327e2b8786cd9f714d9f9df42961a76c0ab0f16
Create an architecture-agnostic buffer pool manager that uses uma(9) to 
manage a set of power-of-2 sized buffers for bus_dmamem_alloc().  

This lets the caller supply the back-end uma allocator functions, giving full
control of the memory pages backing the pool.  For convenience, an optional
builtin allocator is provided that returns pages allocated with the
VM_MEMATTR_UNCACHEABLE attribute, for managing pools of DMA buffers for
BUS_DMA_COHERENT or BUS_DMA_NOCACHE.

The caller can also specify a minimum alignment; every buffer starts on a
boundary that is a multiple of that value and has a length that is a multiple
of it, so pool buffers never trigger partial cache line flushes.

Since this is architecture-agnostic, it shouldn't live in arm/arm, but
I'm not sure what the right place is.
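
Because every buffer in a zone is aligned to the zone's size, a busdma
implementation can decide whether a pool buffer satisfies a tag's
constraints with a single comparison; the third patch's bus_dmamem_alloc()
does essentially this (condensed sketch):

  bufzone = busdma_bufalloc_findzone(ba, dmat->maxsize);
  if (bufzone != NULL && dmat->alignment <= bufzone->size &&
      !_bus_dma_can_bounce(dmat->lowaddr, dmat->highaddr))
          vaddr = uma_zalloc(bufzone->umazone, mflags);
  else
          vaddr = (void *)kmem_alloc_attr(kernel_map, dmat->maxsize,
              mflags, 0, dmat->lowaddr, memattr);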

diff -r c327e2b8786c -r a84eaa62a91f sys/arm/arm/busdma_bufalloc.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ sys/arm/arm/busdma_bufalloc.c	Tue Sep 04 08:23:07 2012 -0600
@@ -0,0 +1,177 @@
+/*-
+ * Copyright (c) 2012 Ian Lepore
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: $");
+
+/*
+ * Buffer allocation support routines for bus_dmamem_alloc implementations.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/malloc.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/uma.h>
+
+#include <machine/busdma_bufalloc.h>
+
+/*
+ * We manage buffer zones up to a page in size.  Buffers larger than a page can
+ * be managed by one of the kernel's page-oriented memory allocation routines as
+ * efficiently as what we can do here.  Also, a page is the largest size for
+ * which we can guarantee contiguity when using uma, and contiguity is one of the
+ * requirements we have to fulfill.
+ */
+#define	MIN_ZONE_BUFSIZE	32
+#define	MAX_ZONE_BUFSIZE	PAGE_SIZE
+
+/*
+ * The static array of 12 bufzones is big enough to handle all the zones for the
+ * smallest supported allocation size of 32 through the largest supported page
+ * size of 64K.  If you up the biggest page size number, up the array size too.
+ * Basically the size of the array needs to be log2(maxsize)-log2(minsize)+1,
+ * but I don't know of an easy way to express that as a compile-time constant.
+ */
+#if PAGE_SIZE > 65536
+#error Unsupported page size
+#endif
+
+struct busdma_bufalloc {
+	bus_size_t		min_size;
+	size_t			num_zones;
+	struct busdma_bufzone	buf_zones[12];
+};
+
+busdma_bufalloc_t 
+busdma_bufalloc_create(const char *name, bus_size_t minimum_alignment,
+    uma_alloc alloc_func, uma_free free_func, u_int32_t zcreate_flags)
+{
+	struct busdma_bufalloc *ba;
+	struct busdma_bufzone *bz;
+	int i;
+	bus_size_t cursize;
+
+	ba = malloc(sizeof(struct busdma_bufalloc), M_DEVBUF, 
+	    M_ZERO | M_WAITOK);
+	if (ba == NULL)
+		panic("Cannot allocate busdma_bufalloc for %s", name);
+
+	ba->min_size = MAX(MIN_ZONE_BUFSIZE, minimum_alignment);
+
+	/*
+	 * Each uma zone is created with an alignment of size-1, meaning that
+	 * the alignment is equal to the size (I.E., 64 byte buffers are aligned
+	 * to 64 byte boundaries, etc).  This allows for a fast efficient test
+	 * when deciding whether a pool buffer meets the constraints of a given
+	 * tag used for allocation: the buffer is usable if tag->alignment <=
+	 * bufzone->size.
+	 */
+	for (i = 0, bz = ba->buf_zones, cursize = ba->min_size;
+	    i < nitems(ba->buf_zones) && cursize <= MAX_ZONE_BUFSIZE;
+	    ++i, ++bz, cursize <<= 1) {
+		snprintf(bz->name, sizeof(bz->name), "dma %.10s %lu",
+		    name, cursize);
+		bz->size = cursize;
+		bz->umazone = uma_zcreate(bz->name, bz->size,
+		    NULL, NULL, NULL, NULL, bz->size - 1, zcreate_flags);
+		if (bz->umazone == NULL) {
+			busdma_bufalloc_destroy(ba);
+			return (NULL);
+		}
+		if (alloc_func != NULL)
+			uma_zone_set_allocf(bz->umazone, alloc_func);
+		if (free_func != NULL)
+			uma_zone_set_freef(bz->umazone, free_func);
+		++ba->num_zones;
+	}
+
+	return (ba);
+}
+
+void 
+busdma_bufalloc_destroy(busdma_bufalloc_t ba)
+{
+	struct busdma_bufzone *bz;
+	int i;
+
+	if (ba == NULL)
+		return;
+
+	for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) {
+		uma_zdestroy(bz->umazone);
+	}
+
+	free(ba, M_DEVBUF);
+}
+
+struct busdma_bufzone * 
+busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size)
+{
+	struct busdma_bufzone *bz;
+	int i;
+
+	if (size > MAX_ZONE_BUFSIZE)
+		return (NULL);
+
+	for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) {
+		if (bz->size >= size)
+			return (bz);
+	}
+
+	panic("Didn't find a buffer zone of the right size");
+}
+
+void *
+busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size, u_int8_t *pflag,
+    int wait)
+{
+#ifdef VM_MEMATTR_UNCACHEABLE
+
+	/* Inform UMA that this allocator uses kernel_map/object. */
+	*pflag = UMA_SLAB_KERNEL;
+
+	return ((void *)kmem_alloc_attr(kernel_map, size, wait, 0,
+	    BUS_SPACE_MAXADDR, VM_MEMATTR_UNCACHEABLE));
+
+#else
+
+	panic("VM_MEMATTR_UNCACHEABLE unavailable");
+
+#endif	/* VM_MEMATTR_UNCACHEABLE */
+}
+
+void 
+busdma_bufalloc_free_uncacheable(void *item, int size, u_int8_t pflag)
+{
+
+	kmem_free(kernel_map, (vm_offset_t)item, size);
+}
+
diff -r c327e2b8786c -r a84eaa62a91f sys/arm/include/busdma_bufalloc.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ sys/arm/include/busdma_bufalloc.h	Tue Sep 04 08:23:07 2012 -0600
@@ -0,0 +1,114 @@
+/*-
+ * Copyright (c) 2012 Ian Lepore
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * A buffer pool manager, for use by a platform's busdma implementation.
+ */
+
+#ifndef _MACHINE_BUSDMA_BUFALLOC_H_
+#define _MACHINE_BUSDMA_BUFALLOC_H_
+
+#include <machine/bus.h>
+#include <vm/uma.h>
+
+/*
+ * Information about a buffer zone, returned by busdma_bufalloc_findzone().
+ */
+struct busdma_bufzone {
+	bus_size_t	size;
+	uma_zone_t	umazone;
+	char		name[24];
+};
+
+/*
+ * Opaque handle type returned by busdma_bufalloc_create().
+ */
+struct busdma_bufalloc;
+typedef struct busdma_bufalloc *busdma_bufalloc_t;
+
+/*
+ * Create an allocator that manages a pool of DMA buffers.
+ *
+ * The allocator manages a collection of uma(9) zones of buffers in power-of-two
+ * sized increments ranging from minimum_alignment to the platform's PAGE_SIZE.
+ * The buffers within each zone are aligned on boundaries corresponding to the
+ * buffer size, and thus by implication each buffer is contiguous within a page
+ * and does not cross a power of two boundary larger than the buffer size.
+ * These rules are intended to make it easy for a busdma implementation to
+ * check whether a tag's constraints allow use of a buffer from the allocator.
+ *
+ * minimum_alignment is also the minimum buffer allocation size.  For platforms
+ * with software-assisted cache coherency, this is typically the data cache line
+ * size (and MUST not be smaller than the cache line size).
+ *
+ * name appears in zone stats as 'dma name nnnnn' where 'dma' is fixed and
+ * 'nnnnn' is the size of buffers in that zone.
+ *
+ * If the alloc/free function pointers are NULL, the regular uma internal
+ * allocators are used (i.e., you get "plain old kernel memory").  On a platform
+ * with an exclusion zone that applies to all DMA operations, a custom allocator
+ * could be used to ensure no buffer memory is ever allocated from that zone,
+ * allowing the bus_dmamem_alloc() implementation to make the assumption that
+ * buffers provided by the allocator could never lead to the need for a bounce.
+ */
+busdma_bufalloc_t busdma_bufalloc_create(const char *name,
+    bus_size_t minimum_alignment,
+    uma_alloc uma_alloc_func, uma_free uma_free_func,
+    u_int32_t uma_zcreate_flags);
+
+/*
+ * Destroy an allocator created by busdma_bufalloc_create().
+ * Safe to call with a NULL pointer.
+ */
+void busdma_bufalloc_destroy(busdma_bufalloc_t ba);
+
+/*
+ * Return a pointer to the busdma_bufzone that should be used to allocate or
+ * free a buffer of the given size.  Returns NULL if the size is larger than the
+ * largest zone handled by the allocator.
+ */
+struct busdma_bufzone * busdma_bufalloc_findzone(busdma_bufalloc_t ba,
+    bus_size_t size);
+
+/*
+ * These built-in allocation routines are available for managing pools of
+ * uncacheable memory on platforms that support VM_MEMATTR_UNCACHEABLE.
+ *
+ * Allocation is done using kmem_alloc_attr() with these parameters:
+ *   lowaddr  = 0
+ *   highaddr = BUS_SPACE_MAXADDR
+ *   memattr  = VM_MEMATTR_UNCACHEABLE.
+ *
+ * If your platform has no exclusion region (lowaddr/highaddr), and its pmap
+ * routines support pmap_page_set_memattr() and the VM_MEMATTR_UNCACHEABLE flag,
+ * you can probably use these when you need uncacheable buffers.
+ */
+void * busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, int size, 
+    u_int8_t *pflag, int wait);
+void  busdma_bufalloc_free_uncacheable(void *item, int size, u_int8_t pflag);
+
+#endif	/* _MACHINE_BUSDMA_BUFALLOC_H_ */
+
diff -r c327e2b8786c -r a84eaa62a91f sys/conf/files.arm
--- sys/conf/files.arm	Sun Sep 02 12:05:05 2012 -0600
+++ sys/conf/files.arm	Tue Sep 04 08:23:07 2012 -0600
@@ -12,6 +12,7 @@ arm/arm/bcopyinout.S		standard
 arm/arm/blockio.S		standard
 arm/arm/bootconfig.c		standard
 arm/arm/bus_space_asm_generic.S	standard
+arm/arm/busdma_bufalloc.c	standard
 arm/arm/busdma_machdep.c 	optional	cpu_arm9 | cpu_arm9e | cpu_fa526 | cpu_sa1100 | cpu_sa1110 | cpu_xscale_80219 | cpu_xscale_80321 | cpu_xscale_81342 | cpu_xscale_ixp425 | cpu_xscale_ixp435 | cpu_xscale_pxa2x0
 arm/arm/busdma_machdep-v6.c 	optional	cpu_arm11 | cpu_cortexa | cpu_mv_pj4b
 arm/arm/copystr.S		standard

--=-nhYas3YqMWIkbv0iBerz
Content-Disposition: attachment; filename="arm_busdma_bufpools.diff"
Content-Type: text/x-patch; name="arm_busdma_bufpools.diff"; charset="us-ascii"
Content-Transfer-Encoding: 7bit

# HG changeset patch
# User Ian Lepore <freebsd@damnhippie.dyndns.org>
# Date 1346768844 21600
# Node ID 919b6b57a222c46f2346339391238bf2634bc51b
# Parent  a84eaa62a91f0993e70448863192b7a057c227b8
Busdma enhancements, especially for managing small uncacheable buffers.

- Use the new architecture-agnostic buffer pool manager that uses uma(9)
  to manage a set of power-of-2 sized buffers for bus_dmamem_alloc().

- Create pools of buffers backed by both regular and uncacheable memory,
  and use them to handle regular versus BUS_DMA_COHERENT allocations.

- Use uma(9) to manage a pool of bus_dmamap structs instead of local code
  to manage a static list of 500 items (it took 3300 maps to get to
  multi-user mode, so the static pool wasn't much of an optimization).

- Small BUS_DMA_COHERENT allocations no longer waste an entire page per
  allocation, or set pages to uncached when they contain data other than
  DMA buffers.  There's no longer a need for drivers to work around the
  inefficiency by allocating large buffers and then sub-dividing them.

- Because we know the alignment and padding of buffers allocated by
  bus_dmamem_alloc() (whether coherent or regular memory, and whether
  obtained from the pool allocator or directly from the kernel) we
  can avoid doing partial cacheline flushes on them.

- Add a fast-out to _bus_dma_can_bounce() (and some comments about what the
  routine really does, because the old misplaced comment was wrong).

- Everywhere the dma tag alignment is used, the interpretation is that
  an alignment of 1 means no special alignment.  If the tag is created
  with an alignment argument of zero, store it in the tag as one, and
  remove all the code scattered around that changed 0->1 at point of use.

- Remove stack-allocated arrays of segments, use a local array of two
  segments within the tag struct, or dynamically allocate an array at first
  use if nsegments > 2.  On an arm system I tested, only 5 of 97 tags used
  more than two segments.  On my x86 desktop it was only 7 of 111 tags.
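
For the last item, the pattern is simply (condensed from the diff below):

  /* In struct bus_dma_tag: */
  bus_dma_segment_t       *segments;      /* array currently in use */
  bus_dma_segment_t       tagsegs[2];     /* inline storage, common case */

  /* At bus_dma_tag_create() time: */
  if (newtag->nsegments <= nitems(newtag->tagsegs))
          newtag->segments = newtag->tagsegs;
  else
          newtag->segments = NULL;  /* malloc'd at first map create/alloc */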

diff -r a84eaa62a91f -r 919b6b57a222 sys/arm/arm/busdma_machdep.c
--- sys/arm/arm/busdma_machdep.c	Tue Sep 04 08:23:07 2012 -0600
+++ sys/arm/arm/busdma_machdep.c	Tue Sep 04 08:27:24 2012 -0600
@@ -1,4 +1,5 @@
 /*-
+ * Copyright (c) 2012 Ian Lepore
  * Copyright (c) 2004 Olivier Houchard
  * Copyright (c) 2002 Peter Grehan
  * Copyright (c) 1997, 1998 Justin T. Gibbs.
@@ -32,7 +33,26 @@
 __FBSDID("$FreeBSD: head/sys/arm/arm/busdma_machdep.c 236991 2012-06-13 04:59:55Z imp $");
 
 /*
- * ARM bus dma support routines
+ * ARM bus dma support routines.
+ *
+ * XXX Things to investigate / fix some day...
+ *  - What is the earliest that this API can be called?  Could there be any
+ *    fallout from changing the SYSINIT() order from SI_SUB_VM to SI_SUB_KMEM?
+ *  - The manpage mentions the BUS_DMA_NOWAIT flag only in the context of the
+ *    bus_dmamap_load() function.  This code has historically (and still does)
+ *    honor it in bus_dmamem_alloc().  If we got rid of that we could lose some
+ *    error checking because some resource management calls would become WAITOK
+ *    and thus "cannot fail."
+ *  - The decisions made by _bus_dma_can_bounce() should be made once, at tag
+ *    creation time, and the result stored in the tag.
+ *  - DMAMAP_COHERENT flag is probably obsolete at this point, and also the
+ *    complex code for sniffing out pages that have cache disabled by some
+ *    mechanism other than bus_dmamem_alloc().
+ *  - It should be possible to take some shortcuts when mapping a buffer we know
+ *    came from the uma(9) allocators based on what we know about such buffers
+ *    (aligned, contiguous, etc).
+ *  - The allocation of bounce pages could probably be cleaned up, then we could
+ *    retire arm_remap_nocache().
  */
 
 #define _ARM32_BUS_DMA_PRIVATE
@@ -50,12 +70,16 @@
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 
+#include <vm/uma.h>
 #include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 
 #include <machine/atomic.h>
 #include <machine/bus.h>
+#include <machine/busdma_bufalloc.h>
 #include <machine/cpufunc.h>
 #include <machine/md_var.h>
 
@@ -90,6 +114,13 @@ struct bus_dma_tag {
 	struct arm32_dma_range	*ranges;
 	int			_nranges;
 	struct bounce_zone *bounce_zone;
+	/*
+	 * Most tags need one or two segments, and can use the local tagsegs
+	 * array.  For tags with a larger limit, we'll allocate a bigger array
+	 * on first use.
+	 */
+	bus_dma_segment_t	*segments;
+	bus_dma_segment_t	tagsegs[2];
 };
 
 struct bounce_page {
@@ -133,7 +164,7 @@ SYSCTL_INT(_hw_busdma, OID_AUTO, total_b
 #define DMAMAP_LINEAR		0x1
 #define DMAMAP_MBUF		0x2
 #define DMAMAP_UIO		0x4
-#define DMAMAP_ALLOCATED	0x10
+#define DMAMAP_CACHE_ALIGNED	0x10
 #define DMAMAP_TYPE_MASK	(DMAMAP_LINEAR|DMAMAP_MBUF|DMAMAP_UIO)
 #define DMAMAP_COHERENT		0x8
 struct bus_dmamap {
@@ -143,9 +174,6 @@ struct bus_dmamap {
         bus_dma_tag_t	dmat;
 	int		flags;
 	void 		*buffer;
-	void		*origbuffer;
-	void		*allocbuffer;
-	TAILQ_ENTRY(bus_dmamap)	freelist;
 	int		len;
 	STAILQ_ENTRY(bus_dmamap) links;
 	bus_dmamap_callback_t *callback;
@@ -156,12 +184,6 @@ struct bus_dmamap {
 static STAILQ_HEAD(, bus_dmamap) bounce_map_waitinglist;
 static STAILQ_HEAD(, bus_dmamap) bounce_map_callbacklist;
 
-static TAILQ_HEAD(,bus_dmamap) dmamap_freelist =
-	TAILQ_HEAD_INITIALIZER(dmamap_freelist);
-
-#define BUSDMA_STATIC_MAPS	500
-static struct bus_dmamap map_pool[BUSDMA_STATIC_MAPS];
-
 static struct mtx busdma_mtx;
 
 MTX_SYSINIT(busdma_mtx, &busdma_mtx, "busdma lock", MTX_DEF);
@@ -178,6 +200,89 @@ static void free_bounce_page(bus_dma_tag
 /* Default tag, as most drivers provide no parent tag. */
 bus_dma_tag_t arm_root_dma_tag;
 
+//----------------------------------------------------------------------------
+// Begin block of code useful to transplant to other implementations.
+
+static struct bus_dmamap coherent_dmamap; /* Dummy; for coherent buffers */
+
+static uma_zone_t dmamap_zone;	/* Cache of struct bus_dmamap items */
+
+static busdma_bufalloc_t coherent_allocator;	/* Cache of coherent buffers */
+static busdma_bufalloc_t standard_allocator;	/* Cache of standard buffers */
+
+/*
+ * This is the ctor function passed to uma_zcreate() for the pool of dma maps.
+ * It'll need platform-specific changes if this code is copied.
+ */
+static int
+dmamap_ctor(void *mem, int size, void *arg, int flags)
+{
+	bus_dmamap_t map;
+	bus_dma_tag_t dmat;
+
+	map = (bus_dmamap_t)mem;
+	dmat = (bus_dma_tag_t)arg;
+
+	dmat->map_count++;
+
+	map->dmat = dmat;
+	map->flags = 0;
+	STAILQ_INIT(&map->bpages);
+
+	return (0);
+}
+
+/*
+ * This is the dtor function passed to uma_zcreate() for the pool of dma maps.
+ * It may need platform-specific changes if this code is copied.
+ */
+static void 
+dmamap_dtor(void *mem, int size, void *arg)
+{
+	bus_dmamap_t map;
+
+	map = (bus_dmamap_t)mem;
+
+	map->dmat->map_count--;
+}
+
+static void
+busdma_init(void *dummy)
+{
+
+	/* Create a cache of maps for bus_dmamap_create(). */
+	dmamap_zone = uma_zcreate("dma maps", sizeof(struct bus_dmamap),
+	    dmamap_ctor, dmamap_dtor, NULL, NULL, UMA_ALIGN_PTR, 0);
+
+	/* Create a cache of buffers in standard (cacheable) memory. */
+	standard_allocator = busdma_bufalloc_create("buffer", 
+	    arm_dcache_align,	/* minimum_alignment */
+	    NULL,		/* uma_alloc func */ 
+	    NULL,		/* uma_free func */
+	    0);			/* uma_zcreate_flags */
+
+	/*
+	 * Create a cache of buffers in uncacheable memory, to implement the
+	 * BUS_DMA_COHERENT (and potentially BUS_DMA_NOCACHE) flag.
+	 */
+	coherent_allocator = busdma_bufalloc_create("coherent",
+	    arm_dcache_align,	/* minimum_alignment */
+	    busdma_bufalloc_alloc_uncacheable, 
+	    busdma_bufalloc_free_uncacheable, 
+	    0);			/* uma_zcreate_flags */
+}
+
+/*
+ * This init historically used SI_SUB_VM, but now the init code requires
+ * malloc(9) using M_DEVBUF memory, which is set up later than SI_SUB_VM, by
+ * SI_SUB_KMEM and SI_ORDER_SECOND, so we'll go right after that by using
+ * SI_SUB_KMEM and SI_ORDER_THIRD.
+ */
+SYSINIT(busdma, SI_SUB_KMEM, SI_ORDER_THIRD, busdma_init, NULL);
+
+// End block of code useful to transplant to other implementations.
+//----------------------------------------------------------------------------
+
 /*
  * Return true if a match is made.
  *
@@ -205,30 +310,26 @@ run_filter(bus_dma_tag_t dmat, bus_addr_
 	return (retval);
 }
 
-static void
-arm_dmamap_freelist_init(void *dummy)
-{
-	int i;
-
-	for (i = 0; i < BUSDMA_STATIC_MAPS; i++)
-		TAILQ_INSERT_HEAD(&dmamap_freelist, &map_pool[i], freelist);
-}
-
-SYSINIT(busdma, SI_SUB_VM, SI_ORDER_ANY, arm_dmamap_freelist_init, NULL);
-
 /*
- * Check to see if the specified page is in an allowed DMA range.
+ * This routine checks the exclusion zone constraints from a tag against the
+ * physical RAM available on the machine.  If a tag specifies an exclusion zone
+ * but there's no RAM in that zone, then we avoid allocating resources to bounce
+ * a request, and we can use any memory allocator (as opposed to needing
+ * kmem_alloc_contig() just because it can allocate pages in an address range).
+ *
+ * Most tags have BUS_SPACE_MAXADDR or BUS_SPACE_MAXADDR_32BIT (they are the
+ * same value on 32-bit architectures) as their lowaddr constraint, and we can't
+ * possibly have RAM at an address higher than the highest address we can
+ * express, so we take a fast out.
  */
-
-static __inline int
-bus_dmamap_load_buffer(bus_dma_tag_t dmat, bus_dma_segment_t *segs,
-    bus_dmamap_t map, void *buf, bus_size_t buflen, struct pmap *pmap,
-    int flags, vm_offset_t *lastaddrp, int *segp);
-
 static __inline int
 _bus_dma_can_bounce(vm_offset_t lowaddr, vm_offset_t highaddr)
 {
 	int i;
+
+	if (lowaddr >= BUS_SPACE_MAXADDR)
+		return (0);
+
 	for (i = 0; phys_avail[i] && phys_avail[i + 1]; i += 2) {
 		if ((lowaddr >= phys_avail[i] && lowaddr <= phys_avail[i + 1])
 		    || (lowaddr < phys_avail[i] &&
@@ -293,38 +394,6 @@ dflt_lock(void *arg, bus_dma_lock_op_t o
 #endif
 }
 
-static __inline bus_dmamap_t
-_busdma_alloc_dmamap(void)
-{
-	bus_dmamap_t map;
-
-	mtx_lock(&busdma_mtx);
-	map = TAILQ_FIRST(&dmamap_freelist);
-	if (map)
-		TAILQ_REMOVE(&dmamap_freelist, map, freelist);
-	mtx_unlock(&busdma_mtx);
-	if (!map) {
-		map = malloc(sizeof(*map), M_DEVBUF, M_NOWAIT | M_ZERO);
-		if (map)
-			map->flags = DMAMAP_ALLOCATED;
-	} else
-		map->flags = 0;
-	STAILQ_INIT(&map->bpages);
-	return (map);
-}
-
-static __inline void
-_busdma_free_dmamap(bus_dmamap_t map)
-{
-	if (map->flags & DMAMAP_ALLOCATED)
-		free(map, M_DEVBUF);
-	else {
-		mtx_lock(&busdma_mtx);
-		TAILQ_INSERT_HEAD(&dmamap_freelist, map, freelist);
-		mtx_unlock(&busdma_mtx);
-	}
-}
-
 /*
  * Allocate a device specific dma_tag.
  */
@@ -353,7 +422,7 @@ bus_dma_tag_create(bus_dma_tag_t parent,
 	}
 
 	newtag->parent = parent;
-	newtag->alignment = alignment;
+	newtag->alignment = alignment ? alignment : 1;
 	newtag->boundary = boundary;
 	newtag->lowaddr = trunc_page((vm_offset_t)lowaddr) + (PAGE_SIZE - 1);
 	newtag->highaddr = trunc_page((vm_offset_t)highaddr) + (PAGE_SIZE - 1);
@@ -374,7 +443,19 @@ bus_dma_tag_create(bus_dma_tag_t parent,
 		newtag->lockfunc = dflt_lock;
 		newtag->lockfuncarg = NULL;
 	}
-        /*
+	/*
+	 * If all the segments we need fit into the local tagsegs array, set the
+	 * pointer now.  Otherwise NULL the pointer and an array of segments
+	 * will be allocated later, on first use.  We don't pre-allocate now
+	 * because some tags exist just to pass constraints to children in the
+	 * device hierarchy, and they tend to use BUS_SPACE_UNRESTRICTED and we
+	 * sure don't want to try to allocate an array for that.
+	 */
+	if (newtag->nsegments <= nitems(newtag->tagsegs))
+		newtag->segments = newtag->tagsegs;
+	else
+		newtag->segments = NULL;
+	/*
 	 * Take into account any restrictions imposed by our parent tag
 	 */
         if (parent != NULL) {
@@ -457,6 +538,8 @@ bus_dma_tag_destroy(bus_dma_tag_t dmat)
                         parent = dmat->parent;
                         atomic_subtract_int(&dmat->ref_count, 1);
                         if (dmat->ref_count == 0) {
+				if (dmat->segments != dmat->tagsegs)
+					free(dmat->segments, M_DEVBUF);
                                 free(dmat, M_DEVBUF);
                                 /*
                                  * Last reference count, so
@@ -481,18 +564,19 @@ bus_dma_tag_destroy(bus_dma_tag_t dmat)
 int
 bus_dmamap_create(bus_dma_tag_t dmat, int flags, bus_dmamap_t *mapp)
 {
-	bus_dmamap_t newmap;
+	bus_dmamap_t map;
 	int error = 0;
 
-	newmap = _busdma_alloc_dmamap();
-	if (newmap == NULL) {
-		CTR3(KTR_BUSDMA, "%s: tag %p error %d", __func__, dmat, ENOMEM);
-		return (ENOMEM);
-	}
-	*mapp = newmap;
-	newmap->dmat = dmat;
-	newmap->allocbuffer = NULL;
-	dmat->map_count++;
+	map = uma_zalloc_arg(dmamap_zone, dmat, M_WAITOK);
+	*mapp = map;
+
+	/*
+	 * If the tag's segments haven't been allocated yet we need to do it
+	 * now, because we can't sleep for resources at map load time.
+	 */
+	if (dmat->segments == NULL)
+		dmat->segments = malloc(dmat->nsegments * 
+		    sizeof(*dmat->segments), M_DEVBUF, M_WAITOK);
 
 	/*
 	 * Bouncing might be required if the driver asks for an active
@@ -507,7 +591,7 @@ bus_dmamap_create(bus_dma_tag_t dmat, in
 
 		if (dmat->bounce_zone == NULL) {
 			if ((error = alloc_bounce_zone(dmat)) != 0) {
-				_busdma_free_dmamap(newmap);
+				uma_zfree(dmamap_zone, map);
 				*mapp = NULL;
 				return (error);
 			}
@@ -560,108 +644,129 @@ bus_dmamap_destroy(bus_dma_tag_t dmat, b
 		    __func__, dmat, EBUSY);
 		return (EBUSY);
 	}
-	_busdma_free_dmamap(map);
+	uma_zfree(dmamap_zone, map);
 	if (dmat->bounce_zone)
 		dmat->bounce_zone->map_count--;
-        dmat->map_count--;
 	CTR2(KTR_BUSDMA, "%s: tag %p error 0", __func__, dmat);
         return (0);
 }
 
 /*
- * Allocate a piece of memory that can be efficiently mapped into
- * bus device space based on the constraints lited in the dma tag.
- * A dmamap to for use with dmamap_load is also allocated.
+ * Allocate a piece of memory that can be efficiently mapped into bus device
+ * space based on the constraints listed in the dma tag.  Returns a pointer to
+ * the allocated memory, and a pointer to an associated bus_dmamap.
  */
 int
-bus_dmamem_alloc(bus_dma_tag_t dmat, void** vaddr, int flags,
+bus_dmamem_alloc(bus_dma_tag_t dmat, void **vaddrp, int flags,
                  bus_dmamap_t *mapp)
 {
-	bus_dmamap_t newmap = NULL;
-
+	void * vaddr;
+	struct busdma_bufzone *bufzone;
+	busdma_bufalloc_t ba;
+	bus_dmamap_t map;
 	int mflags;
+	vm_memattr_t memattr;
 
 	if (flags & BUS_DMA_NOWAIT)
 		mflags = M_NOWAIT;
 	else
 		mflags = M_WAITOK;
+
+	/*
+	 * If the tag's segments haven't been allocated yet we need to do it
+	 * now, because we can't sleep for resources at map load time.
+	 */
+	if (dmat->segments == NULL)
+		dmat->segments = malloc(dmat->nsegments * 
+		   sizeof(*dmat->segments), M_DEVBUF, mflags);
+
+	if (flags & BUS_DMA_COHERENT) {
+		memattr = VM_MEMATTR_UNCACHEABLE;
+		ba = coherent_allocator;
+		map = &coherent_dmamap;
+	} else {
+		memattr = VM_MEMATTR_DEFAULT;
+		ba = standard_allocator;
+		map = uma_zalloc_arg(dmamap_zone, dmat, mflags);
+		if (map == NULL)
+			return (ENOMEM);
+		/* All buffers we allocate are cache-aligned. */
+		map->flags |= DMAMAP_CACHE_ALIGNED;
+	}
+
 	if (flags & BUS_DMA_ZERO)
 		mflags |= M_ZERO;
 
-	newmap = _busdma_alloc_dmamap();
-	if (newmap == NULL) {
-		CTR4(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d",
-		    __func__, dmat, dmat->flags, ENOMEM);
-		return (ENOMEM);
+	/*
+	 * Try to find a bufzone in the allocator that holds a cache of buffers
+	 * of the right size for this request.  If the buffer is too big to be
+	 * held in the allocator cache, this returns NULL.
+	 */
+	bufzone = busdma_bufalloc_findzone(ba, dmat->maxsize);
+
+	/*
+	 * Allocate the buffer from the uma(9) allocator if...
+	 *  - It's small enough to be in the allocator (bufzone not NULL).
+	 *  - The alignment constraint isn't larger than the allocation size
+	 *    (the allocator aligns buffers to their size boundaries).
+	 *  - There's no need to handle lowaddr/highaddr exclusion zones.
+	 * else allocate non-contiguous pages if...
+	 *  - The page count that could get allocated doesn't exceed nsegments.
+	 *  - The alignment constraint isn't larger than a page boundary.
+	 *  - There are no boundary-crossing constraints.
+	 * else allocate a block of contiguous pages because one or more of the
+	 * constraints is something that only the contig allocator can fulfill.
+	 */
+	if (bufzone != NULL && dmat->alignment <= bufzone->size &&
+	    !_bus_dma_can_bounce(dmat->lowaddr, dmat->highaddr)) {
+		vaddr = uma_zalloc(bufzone->umazone, mflags);
+	} else if (dmat->nsegments >= btoc(dmat->maxsize) &&
+	    dmat->alignment <= PAGE_SIZE && dmat->boundary == 0) {
+		vaddr = (void *)kmem_alloc_attr(kernel_map, dmat->maxsize,
+		    mflags, 0, dmat->lowaddr, memattr);
+	} else {
+		vaddr = (void *)kmem_alloc_contig(kernel_map, dmat->maxsize,
+		    mflags, 0, dmat->lowaddr, dmat->alignment, dmat->boundary,
+		    memattr);
 	}
-	dmat->map_count++;
-	*mapp = newmap;
-	newmap->dmat = dmat;
-	
-        if (dmat->maxsize <= PAGE_SIZE &&
-	   (dmat->alignment < dmat->maxsize) &&
-	   !_bus_dma_can_bounce(dmat->lowaddr, dmat->highaddr)) {
-                *vaddr = malloc(dmat->maxsize, M_DEVBUF, mflags);
-        } else {
-                /*
-                 * XXX Use Contigmalloc until it is merged into this facility
-                 *     and handles multi-seg allocations.  Nobody is doing
-                 *     multi-seg allocations yet though.
-                 */
-                *vaddr = contigmalloc(dmat->maxsize, M_DEVBUF, mflags,
-                    0ul, dmat->lowaddr, dmat->alignment? dmat->alignment : 1ul,
-                    dmat->boundary);
-        }
-        if (*vaddr == NULL) {
-		if (newmap != NULL) {
-			_busdma_free_dmamap(newmap);
-			dmat->map_count--;
-		}
-		*mapp = NULL;
-                return (ENOMEM);
+
+	if (vaddr == NULL && map != &coherent_dmamap) {
+		uma_zfree(dmamap_zone, map);
+		map = NULL;
 	}
-	if (flags & BUS_DMA_COHERENT) {
-		void *tmpaddr = arm_remap_nocache(
-		    (void *)((vm_offset_t)*vaddr &~ PAGE_MASK),
-		    dmat->maxsize + ((vm_offset_t)*vaddr & PAGE_MASK));
 
-		if (tmpaddr) {
-			tmpaddr = (void *)((vm_offset_t)(tmpaddr) +
-			    ((vm_offset_t)*vaddr & PAGE_MASK));
-			newmap->origbuffer = *vaddr;
-			newmap->allocbuffer = tmpaddr;
-			*vaddr = tmpaddr;
-		} else
-			newmap->origbuffer = newmap->allocbuffer = NULL;
-	} else
-		newmap->origbuffer = newmap->allocbuffer = NULL;
-        return (0);
+	*vaddrp = vaddr;
+	*mapp = map;
+
+	return (vaddr == NULL ? ENOMEM : 0);
 }
 
 /*
- * Free a piece of memory and it's allocated dmamap, that was allocated
- * via bus_dmamem_alloc.  Make the same choice for free/contigfree.
+ * Free a piece of memory that was allocated via bus_dmamem_alloc, along with
+ * its associated map.
  */
 void
 bus_dmamem_free(bus_dma_tag_t dmat, void *vaddr, bus_dmamap_t map)
 {
-	if (map->allocbuffer) {
-		KASSERT(map->allocbuffer == vaddr,
-		    ("Trying to freeing the wrong DMA buffer"));
-		vaddr = map->origbuffer;
-		arm_unmap_nocache(map->allocbuffer,
-		    dmat->maxsize + ((vm_offset_t)vaddr & PAGE_MASK));
+	struct busdma_bufzone *bufzone;
+	busdma_bufalloc_t ba;
+
+	if (map == &coherent_dmamap) {
+		ba = coherent_allocator;
+	} else {
+		ba = standard_allocator;
+		uma_zfree(dmamap_zone, map);
 	}
-        if (dmat->maxsize <= PAGE_SIZE &&
-	   dmat->alignment < dmat->maxsize &&
+
+	/* Be careful not to access map from here on. */
+
+	bufzone = busdma_bufalloc_findzone(ba, dmat->maxsize);
+
+	if (bufzone != NULL && dmat->alignment <= bufzone->size &&
 	    !_bus_dma_can_bounce(dmat->lowaddr, dmat->highaddr))
-		free(vaddr, M_DEVBUF);
-        else {
-		contigfree(vaddr, dmat->maxsize, M_DEVBUF);
-	}
-	dmat->map_count--;
-	_busdma_free_dmamap(map);
-	CTR3(KTR_BUSDMA, "%s: tag %p flags 0x%x", __func__, dmat, dmat->flags);
+		uma_zfree(bufzone->umazone, vaddr);
+	else
+		kmem_free(kernel_map, (vm_offset_t)vaddr, dmat->maxsize);
 }
 
 static int
@@ -883,11 +988,6 @@ bus_dmamap_load(bus_dma_tag_t dmat, bus_
 {
      	vm_offset_t	lastaddr = 0;
 	int		error, nsegs = -1;
-#ifdef __CC_SUPPORTS_DYNAMIC_ARRAY_INIT
-	bus_dma_segment_t dm_segments[dmat->nsegments];
-#else
-	bus_dma_segment_t dm_segments[BUS_DMAMAP_NSEGS];
-#endif
 
 	KASSERT(dmat != NULL, ("dmatag is NULL"));
 	KASSERT(map != NULL, ("dmamap is NULL"));
@@ -898,14 +998,14 @@ bus_dmamap_load(bus_dma_tag_t dmat, bus_
 	map->buffer = buf;
 	map->len = buflen;
 	error = bus_dmamap_load_buffer(dmat,
-	    dm_segments, map, buf, buflen, kernel_pmap,
+	    dmat->segments, map, buf, buflen, kernel_pmap,
 	    flags, &lastaddr, &nsegs);
 	if (error == EINPROGRESS)
 		return (error);
 	if (error)
 		(*callback)(callback_arg, NULL, 0, error);
 	else
-		(*callback)(callback_arg, dm_segments, nsegs + 1, error);
+		(*callback)(callback_arg, dmat->segments, nsegs + 1, error);
 	
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
 	    __func__, dmat, dmat->flags, nsegs + 1, error);
@@ -915,23 +1015,28 @@ bus_dmamap_load(bus_dma_tag_t dmat, bus_
 
 /*
  * Like bus_dmamap_load(), but for mbufs.
+ *
+ * Note that the manpage states that BUS_DMA_NOWAIT is implied for mbufs.
+ *
+ * We know that the way the system allocates and uses mbufs implies that we can
+ * treat them as DMAMAP_CACHE_ALIGNED in terms of handling partial cache line
+ * flushes. Even though the flush may reference the data area within the mbuf
+ * that isn't aligned to a cache line, we know the overall mbuf itself is
+ * properly aligned, and we know that the CPU will not touch the header fields
+ * in front of the data area while the DMA is in progress.
  */
 int
 bus_dmamap_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0,
 		     bus_dmamap_callback2_t *callback, void *callback_arg,
 		     int flags)
 {
-#ifdef __CC_SUPPORTS_DYNAMIC_ARRAY_INIT
-	bus_dma_segment_t dm_segments[dmat->nsegments];
-#else
-	bus_dma_segment_t dm_segments[BUS_DMAMAP_NSEGS];
-#endif
 	int nsegs = -1, error = 0;
 
 	M_ASSERTPKTHDR(m0);
 
+	flags |= BUS_DMA_NOWAIT;
 	map->flags &= ~DMAMAP_TYPE_MASK;
-	map->flags |= DMAMAP_MBUF | DMAMAP_COHERENT;
+	map->flags |= DMAMAP_MBUF | DMAMAP_COHERENT | DMAMAP_CACHE_ALIGNED;
 	map->buffer = m0;
 	map->len = 0;
 	if (m0->m_pkthdr.len <= dmat->maxsize) {
@@ -941,7 +1046,7 @@ bus_dmamap_load_mbuf(bus_dma_tag_t dmat,
 		for (m = m0; m != NULL && error == 0; m = m->m_next) {
 			if (m->m_len > 0) {
 				error = bus_dmamap_load_buffer(dmat,
-				    dm_segments, map, m->m_data, m->m_len,
+				    dmat->segments, map, m->m_data, m->m_len,
 				    pmap_kernel(), flags, &lastaddr, &nsegs);
 				map->len += m->m_len;
 			}
@@ -954,9 +1059,9 @@ bus_dmamap_load_mbuf(bus_dma_tag_t dmat,
 		/*
 		 * force "no valid mappings" on error in callback.
 		 */
-		(*callback)(callback_arg, dm_segments, 0, 0, error);
+		(*callback)(callback_arg, NULL, 0, 0, error);
 	} else {
-		(*callback)(callback_arg, dm_segments, nsegs + 1,
+		(*callback)(callback_arg, dmat->segments, nsegs + 1,
 		    m0->m_pkthdr.len, error);
 	}
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
@@ -976,7 +1081,7 @@ bus_dmamap_load_mbuf_sg(bus_dma_tag_t dm
 	flags |= BUS_DMA_NOWAIT;
 	*nsegs = -1;
 	map->flags &= ~DMAMAP_TYPE_MASK;
-	map->flags |= DMAMAP_MBUF | DMAMAP_COHERENT;
+	map->flags |= DMAMAP_MBUF | DMAMAP_COHERENT | DMAMAP_CACHE_ALIGNED;
 	map->buffer = m0;			
 	map->len = 0;
 	if (m0->m_pkthdr.len <= dmat->maxsize) {
@@ -1012,11 +1117,6 @@ bus_dmamap_load_uio(bus_dma_tag_t dmat, 
     int flags)
 {
 	vm_offset_t lastaddr = 0;
-#ifdef __CC_SUPPORTS_DYNAMIC_ARRAY_INIT
-	bus_dma_segment_t dm_segments[dmat->nsegments];
-#else
-	bus_dma_segment_t dm_segments[BUS_DMAMAP_NSEGS];
-#endif
 	int nsegs, i, error;
 	bus_size_t resid;
 	struct iovec *iov;
@@ -1048,8 +1148,8 @@ bus_dmamap_load_uio(bus_dma_tag_t dmat, 
 		caddr_t addr = (caddr_t) iov[i].iov_base;
 
 		if (minlen > 0) {
-			error = bus_dmamap_load_buffer(dmat, dm_segments, map,
-			    addr, minlen, pmap, flags, &lastaddr, &nsegs);
+			error = bus_dmamap_load_buffer(dmat, dmat->segments, 
+			    map, addr, minlen, pmap, flags, &lastaddr, &nsegs);
 
 			map->len += minlen;
 			resid -= minlen;
@@ -1060,9 +1160,9 @@ bus_dmamap_load_uio(bus_dma_tag_t dmat, 
 		/*
 		 * force "no valid mappings" on error in callback.
 		 */
-		(*callback)(callback_arg, dm_segments, 0, 0, error);
+		(*callback)(callback_arg, NULL, 0, 0, error);
 	} else {
-		(*callback)(callback_arg, dm_segments, nsegs+1,
+		(*callback)(callback_arg, dmat->segments, nsegs+1,
 		    uio->uio_resid, error);
 	}
 
@@ -1088,7 +1188,7 @@ void
 }
 
 static void
-bus_dmamap_sync_buf(void *buf, int len, bus_dmasync_op_t op)
+bus_dmamap_sync_buf(void *buf, int len, bus_dmasync_op_t op, int bufaligned)
 {
 	char _tmp_cl[arm_dcache_align], _tmp_clend[arm_dcache_align];
 	register_t s;
@@ -1098,7 +1198,25 @@ bus_dmamap_sync_buf(void *buf, int len, 
 		cpu_dcache_wb_range((vm_offset_t)buf, len);
 		cpu_l2cache_wb_range((vm_offset_t)buf, len);
 	}
+
+	/*
+	 * If the caller promises the buffer is properly aligned to a cache line
+	 * (even if the call parms make it look like it isn't) we can avoid
+	 * attempting to preserve the non-DMA part of the cache line in the
+	 * POSTREAD case, but we MUST still do a writeback in the PREREAD case.
+	 *
+	 * This covers the case of mbufs, where we know how they're aligned and
+	 * know the CPU doesn't touch the header in front of the DMA data area
+	 * during the IO, but it may have touched it right before invoking the
+	 * sync, so a PREREAD writeback is required.
+	 *
+	 * It also handles buffers we created in bus_dmamem_alloc(), which are
+	 * always aligned and padded to cache line size even if the IO length
+	 * isn't a multiple of cache line size.  In this case the PREREAD
+	 * writeback probably isn't required, but it's harmless.
+	 */
 	partial = (((vm_offset_t)buf) | len) & arm_dcache_align_mask;
+
 	if (op & BUS_DMASYNC_PREREAD) {
 		if (!(op & BUS_DMASYNC_PREWRITE) && !partial) {
 			cpu_dcache_inv_range((vm_offset_t)buf, len);
@@ -1109,7 +1227,7 @@ bus_dmamap_sync_buf(void *buf, int len, 
 		}
 	}
 	if (op & BUS_DMASYNC_POSTREAD) {
-		if (partial) {
+		if (partial && !bufaligned) {
 			s = intr_disable();
 			if ((vm_offset_t)buf & arm_dcache_align_mask)
 				memcpy(_tmp_cl, (void *)((vm_offset_t)buf &
@@ -1123,7 +1241,7 @@ bus_dmamap_sync_buf(void *buf, int len, 
 		}
 		cpu_dcache_inv_range((vm_offset_t)buf, len);
 		cpu_l2cache_inv_range((vm_offset_t)buf, len);
-		if (partial) {
+		if (partial && !bufaligned) {
 			if ((vm_offset_t)buf & arm_dcache_align_mask)
 				memcpy((void *)((vm_offset_t)buf &
 				    ~arm_dcache_align_mask), _tmp_cl,
@@ -1194,25 +1312,29 @@ void
 	struct uio *uio;
 	int resid;
 	struct iovec *iov;
-	
+	int bufaligned;
+
 	if (op == BUS_DMASYNC_POSTWRITE)
 		return;
+	if (map == &coherent_dmamap)
+		goto drain;
 	if (STAILQ_FIRST(&map->bpages))
 		_bus_dmamap_sync_bp(dmat, map, op);
-	if (map->flags & DMAMAP_COHERENT)
-		return;
 	CTR3(KTR_BUSDMA, "%s: op %x flags %x", __func__, op, map->flags);
+	bufaligned = (map->flags & DMAMAP_CACHE_ALIGNED);
 	switch(map->flags & DMAMAP_TYPE_MASK) {
 	case DMAMAP_LINEAR:
 		if (!(_bus_dma_buf_is_in_bp(map, map->buffer, map->len)))
-			bus_dmamap_sync_buf(map->buffer, map->len, op);
+			bus_dmamap_sync_buf(map->buffer, map->len, op, 
+			    bufaligned);
 		break;
 	case DMAMAP_MBUF:
 		m = map->buffer;
 		while (m) {
 			if (m->m_len > 0 &&
 			    !(_bus_dma_buf_is_in_bp(map, m->m_data, m->m_len)))
-				bus_dmamap_sync_buf(m->m_data, m->m_len, op);
+				bus_dmamap_sync_buf(m->m_data, m->m_len, op,
+				bufaligned);
 			m = m->m_next;
 		}
 		break;
@@ -1227,7 +1349,7 @@ void
 				if (!_bus_dma_buf_is_in_bp(map, iov[i].iov_base,
 				    minlen))
 					bus_dmamap_sync_buf(iov[i].iov_base,
-					    minlen, op);
+					    minlen, op, bufaligned);
 				resid -= minlen;
 			}
 		}
@@ -1235,6 +1357,9 @@ void
 	default:
 		break;
 	}
+
+drain:
+
 	cpu_drain_writebuf();
 }
 

--=-nhYas3YqMWIkbv0iBerz--



