From owner-freebsd-arm@FreeBSD.ORG  Tue Aug 28 01:30:11 2012
Return-Path: <owner-freebsd-arm@FreeBSD.ORG>
Delivered-To: freebsd-arm@hub.freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34])
	by hub.freebsd.org (Postfix) with ESMTP id 308951065670
	for <freebsd-arm@hub.freebsd.org>; Tue, 28 Aug 2012 01:30:11 +0000 (UTC)
	(envelope-from gnats@FreeBSD.org)
Received: from freefall.freebsd.org (freefall.freebsd.org
	[IPv6:2001:4f8:fff6::28])
	by mx1.freebsd.org (Postfix) with ESMTP id 191788FC1A
	for <freebsd-arm@hub.freebsd.org>; Tue, 28 Aug 2012 01:30:11 +0000 (UTC)
Received: from freefall.freebsd.org (localhost [127.0.0.1])
	by freefall.freebsd.org (8.14.5/8.14.5) with ESMTP id q7S1UAcj057819
	for <freebsd-arm@freefall.freebsd.org>; Tue, 28 Aug 2012 01:30:10 GMT
	(envelope-from gnats@freefall.freebsd.org)
Received: (from gnats@localhost)
	by freefall.freebsd.org (8.14.5/8.14.5/Submit) id q7S1UAKP057816;
	Tue, 28 Aug 2012 01:30:10 GMT (envelope-from gnats)
Date: Tue, 28 Aug 2012 01:30:10 GMT
Message-Id: <201208280130.q7S1UAKP057816@freefall.freebsd.org>
To: freebsd-arm@FreeBSD.org
From: dfilter@FreeBSD.ORG (dfilter service)
Cc: 
Subject: Re: arm/155214: commit references a PR
X-BeenThere: freebsd-arm@freebsd.org
X-Mailman-Version: 2.1.5
Precedence: list
Reply-To: dfilter service <dfilter@FreeBSD.ORG>
List-Id: Porting FreeBSD to the StrongARM Processor <freebsd-arm.freebsd.org>
List-Unsubscribe: <http://lists.freebsd.org/mailman/listinfo/freebsd-arm>,
	<mailto:freebsd-arm-request@freebsd.org?subject=unsubscribe>
List-Archive: <http://lists.freebsd.org/pipermail/freebsd-arm>
List-Post: <mailto:freebsd-arm@freebsd.org>
List-Help: <mailto:freebsd-arm-request@freebsd.org?subject=help>
List-Subscribe: <http://lists.freebsd.org/mailman/listinfo/freebsd-arm>,
	<mailto:freebsd-arm-request@freebsd.org?subject=subscribe>
X-List-Received-Date: Tue, 28 Aug 2012 01:30:11 -0000

The following reply was made to PR arm/155214; it has been noted by GNATS.

From: dfilter@FreeBSD.ORG (dfilter service)
To: bug-followup@FreeBSD.org
Cc:  
Subject: Re: arm/155214: commit references a PR
Date: Tue, 28 Aug 2012 01:29:10 +0000 (UTC)

 Author: imp
 Date: Tue Aug 28 01:28:52 2012
 New Revision: 239762
 URL: http://svn.freebsd.org/changeset/base/239762
 
 Log:
   Bring in the multi-block patches for mci.  These required extensive
   restructuring of the driver.  I've tried to preserve the other silicon
   workarounds that we've added over the years, but haven't had a chance
   to extensively test on other hardware.  On my AT91RM9200 with 30MHz/1
   wire/64 block transfers, I've been able to go from ~.66MB/s to
   2.25MB/s in the simple tests I performed, almost a 3.5x improvement.
   This cuts the boot time almost in half when everything else goes
   right (timed from rtc message to login: prompt).
   
   PR:		155214
   Submitted by:	Ian Lepore
 
 Modified:
   head/sys/arm/at91/at91_mci.c
 
 Modified: head/sys/arm/at91/at91_mci.c
 ==============================================================================
 --- head/sys/arm/at91/at91_mci.c	Mon Aug 27 23:27:41 2012	(r239761)
 +++ head/sys/arm/at91/at91_mci.c	Tue Aug 28 01:28:52 2012	(r239762)
 @@ -114,7 +114,24 @@ __FBSDID("$FreeBSD$");
  #define AT91_MCI_USE_30MHZ 1
  #endif
  
 -#define BBSZ	512
 +/*
 + * Allocate 2 bounce buffers we'll use to endian-swap the data due to the rm9200
 + * erratum.  We use a pair of buffers because when reading that lets us begin
 + * endian-swapping the data in the first buffer while the DMA is reading into
 + * the second buffer.  (We can't use the same trick for writing because we might
 + * not get all the data in the 2nd buffer swapped before the hardware needs it;
 + * dealing with that would add complexity to the driver.)
 + *
 + * The buffers are sized at 16K each due to the way the busdma cache sync
 + * operations work on arm.  A dcache_inv_range() operation on a range larger
 + * than 16K gets turned into a dcache_wbinv_all().  That needlessly flushes the
 + * entire data cache, impacting overall system performance.
 + */
 +#define BBCOUNT     2
 +#define BBSIZE      (16*1024)
 +#define MAX_BLOCKS  ((BBSIZE*BBCOUNT)/512)
 +
 +static int mci_debug;
  
  struct at91_mci_softc {
  	void *intrhand;			/* Interrupt handle */
 @@ -123,21 +140,25 @@ struct at91_mci_softc {
  #define	CAP_HAS_4WIRE		1	/* Has 4 wire bus */
  #define	CAP_NEEDS_BYTESWAP	2	/* broken hardware needing bounce */
  	int flags;
 -#define CMD_STARTED	1
 -#define STOP_STARTED	2
 +#define PENDING_CMD	0x01
 +#define PENDING_STOP	0x02
 +#define CMD_MULTIREAD	0x10
 +#define CMD_MULTIWRITE	0x20
  	int has_4wire;
  	int use_30mhz;
  	struct resource *irq_res;	/* IRQ resource */
  	struct resource	*mem_res;	/* Memory resource */
  	struct mtx sc_mtx;
  	bus_dma_tag_t dmatag;
 -	bus_dmamap_t map;
 -	int mapped;
  	struct mmc_host host;
  	int bus_busy;
  	struct mmc_request *req;
  	struct mmc_command *curcmd;
 -	char bounce_buffer[BBSZ];
 +	bus_dmamap_t bbuf_map[BBCOUNT];
 +	char      *  bbuf_vaddr[BBCOUNT]; /* bounce bufs in KVA space */
 +	uint32_t     bbuf_len[BBCOUNT];	  /* len currently queued for bounce buf */
 +	uint32_t     bbuf_curidx;	  /* which bbuf is the active DMA buffer */
 +	uint32_t     xfer_offset;	  /* offset so far into caller's buf */
  };
  
  static inline uint32_t
 @@ -172,6 +193,51 @@ static int at91_mci_is_mci1rev2xx(void);
  #define AT91_MCI_ASSERT_LOCKED(_sc)	mtx_assert(&_sc->sc_mtx, MA_OWNED);
  #define AT91_MCI_ASSERT_UNLOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_NOTOWNED);
  
 +static void
 +at91_bswap_buf(struct at91_mci_softc *sc, void * dptr, void * sptr, uint32_t memsize)
 +{
 +	uint32_t * dst = (uint32_t *)dptr;
 +	uint32_t * src = (uint32_t *)sptr;
 +	uint32_t   i;
 +
 +	/*
 +	 * Copy memsize bytes from sptr to dptr, byte-swapping each 32-bit word
 +	 * if the hardware needs it; otherwise let bcopy() do the work (note
 +	 * its argument order is src, dst, len).  We bounce even without
 +	 * byteswap since the buffer may straddle a page boundary and we don't
 +	 * handle multi-segment transfers in hardware.  Seen from 'bsdlabel -w'
 +	 * which uses raw geom access.  Greg Ansley (gja (at) ansley.com)
 +	 */
 +	if (!(sc->sc_cap & CAP_NEEDS_BYTESWAP)) {
 +		bcopy(sptr, dptr, memsize);
 +		return;
 +	}
 +
 +	/*
 +	 * Slightly unrolled for a nice performance boost (further unrolling
 +	 * gains little).  Only whole 16-byte groups are handled here.
 +	 */
 +	for (i = 0; i < (memsize & ~0x0FU); i += 16) {
 +		*dst++ = bswap32(*src++);
 +		*dst++ = bswap32(*src++);
 +		*dst++ = bswap32(*src++);
 +		*dst++ = bswap32(*src++);
 +	}
 +
 +	/* Mop up the last 1-3 words, if any (memsize assumed a multiple of 4). */
 +	for (i = 0; i < (memsize & 0x0F); i += 4) {
 +		*dst++ = bswap32(*src++);
 +	}
 +}
 +
 +static void
 +at91_mci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 +{	/* bus_dmamap_load() callback: returns segment 0's bus address via arg */
 +	if (error != 0)		/* on error, *arg is left unwritten */
 +		return;
 +	*(bus_addr_t *)arg = segs[0].ds_addr;	/* nsegs is not consulted */
 +}
 +
  static void
  at91_mci_pdc_disable(struct at91_mci_softc *sc)
  {
 @@ -186,13 +252,57 @@ at91_mci_pdc_disable(struct at91_mci_sof
  	WR4(sc, PDC_TNCR, 0);
  }
  
 +/*
 + * Reset the controller, then restore most of the current state (interrupt
 + * mask, low 15 bits of MR, SDCR, DTOR).
 + *
 + * Called after detecting an error, and from at91_mci_stop_done() after the
 + * STOP ending a multi-block read or write; see the comments there for why.
 + */
 +static void at91_mci_reset(struct at91_mci_softc *sc)
 +{
 +	uint32_t mr;
 +	uint32_t sdcr;
 +	uint32_t dtor;
 +	uint32_t imr;
 +
 +	at91_mci_pdc_disable(sc);
 +
 +	/* save current state so it can be written back after the reset */
 +
 +	imr  = RD4(sc, MCI_IMR);	/* written back through MCI_IER below */
 +	mr   = RD4(sc, MCI_MR) & 0x7fff;	/* upper MR bits are not restored */
 +	sdcr = RD4(sc, MCI_SDCR);
 +	dtor = RD4(sc, MCI_DTOR);
 +
 +	/* reset the controller: mask all interrupts, then disable + sw reset */
 +
 +	WR4(sc, MCI_IDR, 0xffffffff);
 +	WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST);
 +
 +	/* restore state; the CR value matches the one at91_mci_init() writes */
 +
 +	WR4(sc, MCI_CR, MCI_CR_MCIEN|MCI_CR_PWSEN);
 +	WR4(sc, MCI_MR, mr);
 +	WR4(sc, MCI_SDCR, sdcr);
 +	WR4(sc, MCI_DTOR, dtor);
 +	WR4(sc, MCI_IER, imr);
 +
 +	/*
 +	 * Make sure sdio interrupts will fire.  Not sure why reading
 +	 * SR ensures that, but this is in the linux driver.
 +	 */
 +
 +	RD4(sc, MCI_SR);	/* value intentionally discarded */
 +}
 +
  static void
  at91_mci_init(device_t dev)
  {
  	struct at91_mci_softc *sc = device_get_softc(dev);
  	uint32_t val;
  
 -	WR4(sc, MCI_CR, MCI_CR_MCIEN);		/* Enable controller */
 +	WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST); /* device into reset */
  	WR4(sc, MCI_IDR, 0xffffffff);		/* Turn off interrupts */
  	WR4(sc, MCI_DTOR, MCI_DTOR_DTOMUL_1M | 1);
  	val = MCI_MR_PDCMODE;
 @@ -203,10 +313,19 @@ at91_mci_init(device_t dev)
  #ifndef  AT91_MCI_SLOT_B
  	WR4(sc, MCI_SDCR, 0);			/* SLOT A, 1 bit bus */
  #else
 -	/* XXX Really should add second "unit" but nobody using using
 -	 * a two slot card that we know of. -- except they are... XXX */
 +	/*
 +	 * XXX Really should add second "unit" but nobody is using
 +	 * a two slot card that we know of. XXX
 +	 */
  	WR4(sc, MCI_SDCR, 1);			/* SLOT B, 1 bit bus */
  #endif
 +	/*
 +	 * Enable controller, including power-save.  The slower clock
 +	 * of the power-save mode is only in effect when there is no
 +	 * transfer in progress, so it can be left in this mode all
 +	 * the time.
 +	 */
 +	WR4(sc, MCI_CR, MCI_CR_MCIEN|MCI_CR_PWSEN);
  }
  
  static void
 @@ -216,7 +335,7 @@ at91_mci_fini(device_t dev)
  
  	WR4(sc, MCI_IDR, 0xffffffff);		/* Turn off interrupts */
  	at91_mci_pdc_disable(sc);
 -	WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST); /* Put the device into reset */
 +	WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST); /* device into reset */
  }
  
  static int
 @@ -234,7 +353,7 @@ at91_mci_attach(device_t dev)
  	struct sysctl_ctx_list *sctx;
  	struct sysctl_oid *soid;
  	device_t child;
 -	int err;
 +	int err, i;
  
  	sctx = device_get_sysctl_ctx(dev);
  	soid = device_get_sysctl_tree(dev);
 @@ -249,21 +368,33 @@ at91_mci_attach(device_t dev)
  
  	AT91_MCI_LOCK_INIT(sc);
  
 +	at91_mci_fini(dev);
 +	at91_mci_init(dev);
 +
  	/*
 -	 * Allocate DMA tags and maps
 +	 * Allocate DMA tags and maps and bounce buffers.
 +	 *
 +	 * The parms in the tag_create call cause the dmamem_alloc call to
 +	 * create each bounce buffer as a single contiguous buffer of BBSIZE
 +	 * bytes aligned to a 4096 byte boundary.
 +	 *
 +	 * Do not use DMA_COHERENT for these buffers because that maps the
 +	 * memory as non-cachable, which prevents cache line burst fills/writes,
 +	 * which is something we need since we're trying to overlap the
 +	 * byte-swapping with the DMA operations.
  	 */
 -	err = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0,
 -	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, MAXPHYS, 1,
 -	    MAXPHYS, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->dmatag);
 +	err = bus_dma_tag_create(bus_get_dma_tag(dev), 4096, 0,
 +	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, 
 +	    BBSIZE, 1, BBSIZE, 0, NULL, NULL, &sc->dmatag);
  	if (err != 0)
  		goto out;
  
 -	err = bus_dmamap_create(sc->dmatag, 0,  &sc->map);
 -	if (err != 0)
 -		goto out;
 -
 -	at91_mci_fini(dev);
 -	at91_mci_init(dev);
 +	for (i = 0; i < BBCOUNT; ++i) {
 +		err = bus_dmamem_alloc(sc->dmatag, (void **)&sc->bbuf_vaddr[i],
 +		    BUS_DMA_NOWAIT, &sc->bbuf_map[i]);
 +		if (err != 0)
 +			goto out;
 +	}
  
  	/*
  	 * Activate the interrupt
 @@ -330,8 +461,15 @@ out:
  static int
  at91_mci_detach(device_t dev)
  {
 +	struct at91_mci_softc *sc = device_get_softc(dev);
 +
  	at91_mci_fini(dev);
  	at91_mci_deactivate(dev);
 +
 +	bus_dmamem_free(sc->dmatag, sc->bbuf_vaddr[0], sc->bbuf_map[0]);
 +	bus_dmamem_free(sc->dmatag, sc->bbuf_vaddr[1], sc->bbuf_map[1]);
 +	bus_dma_tag_destroy(sc->dmatag);
 +
  	return (EBUSY);	/* XXX */
  }
  
 @@ -398,14 +536,6 @@ at91_mci_is_mci1rev2xx(void)
  	}
  }
  
 -static void
 -at91_mci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
 -{
 -	if (error != 0)
 -		return;
 -	*(bus_addr_t *)arg = segs[0].ds_addr;
 -}
 -
  static int
  at91_mci_update_ios(device_t brdev, device_t reqdev)
  {
 @@ -437,7 +567,7 @@ at91_mci_update_ios(device_t brdev, devi
  		if (sc->use_30mhz && ios->clock == 25000000 &&
  		    at91_master_clock > 50000000)
  			clkdiv = 0;
 -                else if ((at91_master_clock % (ios->clock * 2)) == 0)
 +		else if ((at91_master_clock % (ios->clock * 2)) == 0)
  			clkdiv = ((at91_master_clock / ios->clock) / 2) - 1;
  		else
  			clkdiv = (at91_master_clock / ios->clock) / 2;
 @@ -456,73 +586,182 @@ at91_mci_update_ios(device_t brdev, devi
  static void
  at91_mci_start_cmd(struct at91_mci_softc *sc, struct mmc_command *cmd)
  {
 -	size_t len;
 -	uint32_t cmdr, ier = 0, mr;
 -	uint32_t *src, *dst;
 -	int i;
 +	uint32_t cmdr, mr;
  	struct mmc_data *data;
 -	void *vaddr;
 -	bus_addr_t paddr;
  
  	sc->curcmd = cmd;
  	data = cmd->data;
 -	cmdr = cmd->opcode;
  
  	/* XXX Upper layers don't always set this */
  	cmd->mrq = sc->req;
  
 +	/* Begin setting up command register. */
 +
 +	cmdr = cmd->opcode;
 +
 +	if (sc->host.ios.bus_mode == opendrain)
 +		cmdr |= MCI_CMDR_OPDCMD;
 +
 +	/* Set up response handling.  Allow max timeout for responses. */
 +
  	if (MMC_RSP(cmd->flags) == MMC_RSP_NONE)
  		cmdr |= MCI_CMDR_RSPTYP_NO;
  	else {
 -		/* Allow big timeout for responses */
  		cmdr |= MCI_CMDR_MAXLAT;
  		if (cmd->flags & MMC_RSP_136)
  			cmdr |= MCI_CMDR_RSPTYP_136;
  		else
  			cmdr |= MCI_CMDR_RSPTYP_48;
  	}
 -	if (cmd->opcode == MMC_STOP_TRANSMISSION)
 -		cmdr |= MCI_CMDR_TRCMD_STOP;
 -	if (sc->host.ios.bus_mode == opendrain)
 -		cmdr |= MCI_CMDR_OPDCMD;
 -	if (!data) {
 -		// The no data case is fairly simple
 +
 +	/*
 +	 * If there is no data transfer, just set up the right interrupt mask
 +	 * and start the command.
 +	 *
 +	 * The interrupt mask needs to be CMDRDY plus all non-data-transfer
 +	 * errors. It's important to leave the transfer-related errors out, to
 +	 * avoid spurious timeout or crc errors on a STOP command following a
 +	 * multiblock read.  When a multiblock read is in progress, sending a
 +	 * STOP in the middle of a block occasionally triggers such errors, but
 +	 * we're totally disinterested in them because we've already gotten all
 +	 * the data we wanted without error before sending the STOP command.
 +	 */
 +
 +	if (data == NULL) {
 +		uint32_t ier = MCI_SR_CMDRDY | 
 +		    MCI_SR_RTOE | MCI_SR_RENDE | 
 +		    MCI_SR_RCRCE | MCI_SR_RDIRE | MCI_SR_RINDE;
 +
  		at91_mci_pdc_disable(sc);
 -//		printf("CMDR %x ARGR %x\n", cmdr, cmd->arg);
 +
 +		if (cmd->opcode == MMC_STOP_TRANSMISSION)
 +			cmdr |= MCI_CMDR_TRCMD_STOP;
 +
 +		/* Ignore response CRC on CMD2 and ACMD41, per standard. */
 +
 +		if (cmd->opcode == MMC_SEND_OP_COND ||
 +		    cmd->opcode == ACMD_SD_SEND_OP_COND)
 +			ier &= ~MCI_SR_RCRCE;
 +
 +		if (mci_debug)
 +			printf("CMDR %x (opcode %d) ARGR %x no data\n", 
 +			    cmdr, cmd->opcode, cmd->arg);
 +
  		WR4(sc, MCI_ARGR, cmd->arg);
  		WR4(sc, MCI_CMDR, cmdr);
 -		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_CMDRDY);
 +		WR4(sc, MCI_IDR, 0xffffffff);
 +		WR4(sc, MCI_IER, ier);
  		return;
  	}
 +
 +	/* There is data, set up the transfer-related parts of the command. */
 +
  	if (data->flags & MMC_DATA_READ)
  		cmdr |= MCI_CMDR_TRDIR;
 +
  	if (data->flags & (MMC_DATA_READ | MMC_DATA_WRITE))
  		cmdr |= MCI_CMDR_TRCMD_START;
 +
  	if (data->flags & MMC_DATA_STREAM)
  		cmdr |= MCI_CMDR_TRTYP_STREAM;
 -	if (data->flags & MMC_DATA_MULTI)
 +	else if (data->flags & MMC_DATA_MULTI) {
  		cmdr |= MCI_CMDR_TRTYP_MULTIPLE;
 -	// Set block size and turn on PDC mode for dma xfer and disable
 -	// PDC until we're ready.
 -	mr = RD4(sc, MCI_MR) & ~MCI_MR_BLKLEN;
 -	WR4(sc, MCI_MR, mr | (data->len << 16) | MCI_MR_PDCMODE);
 -	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
 -	if (cmdr & MCI_CMDR_TRCMD_START) {
 -		len = data->len;
 -		if (cmdr & MCI_CMDR_TRDIR)
 -			vaddr = cmd->data->data;
 -		else {
 -			/* Use bounce buffer even if we don't need
 -			 * byteswap, since buffer may straddle a page
 -			 * boundry, and we don't handle multi-segment
 -			 * transfers in hardware.
 -			 * (page issues seen from 'bsdlabel -w' which
 -			 * uses raw geom access to the volume).
 -			 * Greg Ansley (gja (at) ansley.com)
 -			 */
 -			vaddr = sc->bounce_buffer;
 -			src = (uint32_t *)cmd->data->data;
 -			dst = (uint32_t *)vaddr;
 +		sc->flags |= (data->flags & MMC_DATA_READ) ? 
 +				CMD_MULTIREAD : CMD_MULTIWRITE;
 +	}
 +
 +	/*
 +	 * Disable PDC until we're ready.
 +	 *
 +	 * Set block size and turn on PDC mode for dma xfer.
 +	 * Note that the block size is the smaller of the amount of data to be
 +	 * transferred, or 512 bytes.  The 512 size is fixed by the standard;
 +	 * smaller blocks are possible, but never larger.
 +	 */
 +
 +	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS); 
 +
 +	mr = RD4(sc,MCI_MR) & ~MCI_MR_BLKLEN; 
 +	mr |=  min(data->len, 512) << 16; 
 +	WR4(sc, MCI_MR, mr | MCI_MR_PDCMODE|MCI_MR_PDCPADV);
 +
 +	/*
 +	 * Set up DMA.
 +	 *
 +	 * Use bounce buffers even if we don't need to byteswap, because doing
 +	 * multi-block IO with large DMA buffers is way fast (compared to
 +	 * single-block IO), even after incurring the overhead of also copying
 +	 * from/to the caller's buffers (which may be in non-contiguous physical
 +	 * pages).
 +	 *
 +	 * In an ideal non-byteswap world we could create a dma tag that allows
 +	 * for discontiguous segments and do the IO directly from/to the
 +	 * caller's buffer(s), using ENDRX/ENDTX interrupts to chain the
 +	 * discontiguous buffers through the PDC. Someday.
 +	 *
 +	 * If a read is bigger than 2k, split it in half so that we can start
 +	 * byte-swapping the first half while the second half is on the wire.
 +	 * It would be best if we could split it into 8k chunks, but we can't
 +	 * always keep up with the byte-swapping due to other system activity,
 +	 * and if an RXBUFF interrupt happens while we're still handling the
 +	 * byte-swap from the prior buffer (IE, we haven't returned from
 +	 * handling the prior interrupt yet), then data will get dropped on the
 +	 * floor and we can't easily recover from that.  The right fix for that
 +	 * would be to have the interrupt handling only keep the DMA flowing and
 +	 * enqueue filled buffers to be byte-swapped in a non-interrupt context.
 +	 * Even that won't work on the write side of things though; in that
 +	 * context we have to have all the data ready to go before starting the
 +	 * dma.
 +	 *
 +	 * XXX what about stream transfers?
 +	 */
 +	sc->xfer_offset = 0;
 +	sc->bbuf_curidx = 0;
 +
 +	if (data->flags & (MMC_DATA_READ | MMC_DATA_WRITE)) {
 +		uint32_t len;
 +		uint32_t remaining = data->len;
 +		bus_addr_t paddr;
 +		int err;
 +
 +		if (remaining > (BBCOUNT*BBSIZE))
 +			panic("IO read size exceeds MAXDATA\n");
 +
 +		if (data->flags & MMC_DATA_READ) {
 +			if (remaining > 2048) // XXX
 +				len = remaining / 2;
 +			else
 +				len = remaining;
 +			err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[0], 
 +			    sc->bbuf_vaddr[0], len, at91_mci_getaddr, 
 +			    &paddr, BUS_DMA_NOWAIT);
 +			if (err != 0)
 +				panic("IO read dmamap_load failed\n");
 +			bus_dmamap_sync(sc->dmatag, sc->bbuf_map[0], 
 +			    BUS_DMASYNC_PREREAD);
 +			WR4(sc, PDC_RPR, paddr);
 +			WR4(sc, PDC_RCR, len / 4);
 +			sc->bbuf_len[0] = len;
 +			remaining -= len;
 +			if (remaining == 0) {
 +				sc->bbuf_len[1] = 0;
 +			} else {
 +				len = remaining;
 +				err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[1], 
 +				    sc->bbuf_vaddr[1], len, at91_mci_getaddr, 
 +				    &paddr, BUS_DMA_NOWAIT);
 +				if (err != 0)
 +					panic("IO read dmamap_load failed\n");
 +				bus_dmamap_sync(sc->dmatag, sc->bbuf_map[1], 
 +				    BUS_DMASYNC_PREREAD);
 +				WR4(sc, PDC_RNPR, paddr);
 +				WR4(sc, PDC_RNCR, len / 4);
 +				sc->bbuf_len[1] = len;
 +				remaining -= len;
 +			}
 +			WR4(sc, PDC_PTCR, PDC_PTCR_RXTEN);
 +		} else {
 +			len = min(BBSIZE, remaining);
  			/*
  			 * If this is MCI1 revision 2xx controller, apply
  			 * a work-around for the "Data Write Operation and
 @@ -530,74 +769,75 @@ at91_mci_start_cmd(struct at91_mci_softc
  			 */
  			if (at91_mci_is_mci1rev2xx() && data->len < 12) {
  				len = 12;
 -				memset(dst, 0, 12);
 +				memset(data->data, 0, 12);
  			}
 -			if (sc->sc_cap & CAP_NEEDS_BYTESWAP) {
 -				for (i = 0; i < data->len / 4; i++)
 -					dst[i] = bswap32(src[i]);
 -			} else
 -				memcpy(dst, src, data->len);
 -		}
 -		data->xfer_len = 0;
 -		if (bus_dmamap_load(sc->dmatag, sc->map, vaddr, len,
 -		    at91_mci_getaddr, &paddr, 0) != 0) {
 -			cmd->error = MMC_ERR_NO_MEMORY;
 -			sc->req = NULL;
 -			sc->curcmd = NULL;
 -			cmd->mrq->done(cmd->mrq);
 -			return;
 -		}
 -		sc->mapped++;
 -		if (cmdr & MCI_CMDR_TRDIR) {
 -			bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_PREREAD);
 -			WR4(sc, PDC_RPR, paddr);
 -			WR4(sc, PDC_RCR, len / 4);
 -			ier = MCI_SR_ENDRX;
 -		} else {
 -			bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_PREWRITE);
 -			WR4(sc, PDC_TPR, paddr);
 +			at91_bswap_buf(sc, sc->bbuf_vaddr[0], data->data, len);
 +			err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[0], 
 +			    sc->bbuf_vaddr[0], len, at91_mci_getaddr, 
 +			    &paddr, BUS_DMA_NOWAIT);
 +			if (err != 0)
 +				panic("IO write dmamap_load failed\n");
 +			bus_dmamap_sync(sc->dmatag, sc->bbuf_map[0], 
 +			    BUS_DMASYNC_PREWRITE);
 +			WR4(sc, PDC_TPR,paddr);
  			WR4(sc, PDC_TCR, len / 4);
 -			ier = MCI_SR_TXBUFE;
 +			sc->bbuf_len[0] = len;
 +			remaining -= len;
 +			if (remaining == 0) {
 +				sc->bbuf_len[1] = 0;
 +			} else {
 +				len = remaining;
 +				at91_bswap_buf(sc, sc->bbuf_vaddr[1],
 +				    ((char *)data->data)+BBSIZE, len);
 +				err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[1], 
 +				    sc->bbuf_vaddr[1], len, at91_mci_getaddr, 
 +				    &paddr, BUS_DMA_NOWAIT);
 +				if (err != 0)
 +					panic("IO write dmamap_load failed\n");
 +				bus_dmamap_sync(sc->dmatag, sc->bbuf_map[1], 
 +				    BUS_DMASYNC_PREWRITE);
 +				WR4(sc, PDC_TNPR, paddr);
 +				WR4(sc, PDC_TNCR, len / 4);
 +				sc->bbuf_len[1] = len;
 +				remaining -= len;
 +			}
 +			/* do not enable PDC xfer until CMDRDY asserted */
  		}
 +		data->xfer_len = 0; /* XXX what's this? appears to be unused. */
  	}
 -//	printf("CMDR %x ARGR %x with data\n", cmdr, cmd->arg);
 +
 +	if (mci_debug)
 +		printf("CMDR %x (opcode %d) ARGR %x with data len %d\n", 
 +		       cmdr, cmd->opcode, cmd->arg, cmd->data->len);
 +
  	WR4(sc, MCI_ARGR, cmd->arg);
 -	if (cmdr & MCI_CMDR_TRCMD_START) {
 -		if (cmdr & MCI_CMDR_TRDIR) {
 -			WR4(sc, PDC_PTCR, PDC_PTCR_RXTEN);
 -			WR4(sc, MCI_CMDR, cmdr);
 -		} else {
 -			WR4(sc, MCI_CMDR, cmdr);
 -			WR4(sc, PDC_PTCR, PDC_PTCR_TXTEN);
 -		}
 -	}
 -	WR4(sc, MCI_IER, MCI_SR_ERROR | ier);
 +	WR4(sc, MCI_CMDR, cmdr);
 +	WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_CMDRDY);
  }
  
  static void
 -at91_mci_start(struct at91_mci_softc *sc)
 +at91_mci_next_operation(struct at91_mci_softc *sc)
  {
  	struct mmc_request *req;
  
  	req = sc->req;
  	if (req == NULL)
  		return;
 -	// assert locked
 -	if (!(sc->flags & CMD_STARTED)) {
 -		sc->flags |= CMD_STARTED;
 -//		printf("Starting CMD\n");
 +
 +	if (sc->flags & PENDING_CMD) {
 +		sc->flags &= ~PENDING_CMD;
  		at91_mci_start_cmd(sc, req->cmd);
  		return;
 -	}
 -	if (!(sc->flags & STOP_STARTED) && req->stop) {
 -//		printf("Starting Stop\n");
 -		sc->flags |= STOP_STARTED;
 +	} else if (sc->flags & PENDING_STOP) {
 +		sc->flags &= ~PENDING_STOP;
  		at91_mci_start_cmd(sc, req->stop);
  		return;
  	}
 -	/* We must be done -- bad idea to do this while locked? */
 +
 +	WR4(sc, MCI_IDR, 0xffffffff);
  	sc->req = NULL;
  	sc->curcmd = NULL;
 +	//printf("req done\n");
  	req->done(req);
  }
  
 @@ -607,16 +847,16 @@ at91_mci_request(device_t brdev, device_
  	struct at91_mci_softc *sc = device_get_softc(brdev);
  
  	AT91_MCI_LOCK(sc);
 -	// XXX do we want to be able to queue up multiple commands?
 -	// XXX sounds like a good idea, but all protocols are sync, so
 -	// XXX maybe the idea is naive...
  	if (sc->req != NULL) {
  		AT91_MCI_UNLOCK(sc);
  		return (EBUSY);
  	}
 +	//printf("new req\n");
  	sc->req = req;
 -	sc->flags = 0;
 -	at91_mci_start(sc);
 +	sc->flags = PENDING_CMD;
 +	if (sc->req->stop)
 +		sc->flags |= PENDING_STOP;
 +	at91_mci_next_operation(sc);
  	AT91_MCI_UNLOCK(sc);
  	return (0);
  }
 @@ -654,120 +894,351 @@ at91_mci_release_host(device_t brdev, de
  }
  
  static void
 -at91_mci_read_done(struct at91_mci_softc *sc)
 +at91_mci_read_done(struct at91_mci_softc *sc, uint32_t sr)
  {
 -	uint32_t *walker;
 -	struct mmc_command *cmd;
 -	int i, len;
 -
 -	cmd = sc->curcmd;
 -	bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_POSTREAD);
 -	bus_dmamap_unload(sc->dmatag, sc->map);
 -	sc->mapped--;
 -	if (sc->sc_cap & CAP_NEEDS_BYTESWAP) {
 -		walker = (uint32_t *)cmd->data->data;
 -		len = cmd->data->len / 4;
 -		for (i = 0; i < len; i++)
 -			walker[i] = bswap32(walker[i]);
 -	}
 -	// Finish up the sequence...
 -	WR4(sc, MCI_IDR, MCI_SR_ENDRX);
 -	WR4(sc, MCI_IER, MCI_SR_RXBUFF);
 -	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
 +	struct mmc_command *cmd = sc->curcmd;
 +	char * dataptr = (char *)cmd->data->data;
 +	uint32_t curidx = sc->bbuf_curidx;
 +	uint32_t len = sc->bbuf_len[curidx];
 +
 +	/*
 +	 * We arrive here when a DMA transfer for a read is done, whether it's
 +	 * a single or multi-block read.
 +	 *
 +	 * We byte-swap the buffer that just completed, and if that is the
 +	 * last buffer that's part of this read then we move on to the next
 +	 * operation, otherwise we wait for another ENDRX for the next buffer.
 +	 */
 +
 +	bus_dmamap_sync(sc->dmatag, sc->bbuf_map[curidx], BUS_DMASYNC_POSTREAD);
 +	bus_dmamap_unload(sc->dmatag, sc->bbuf_map[curidx]);
 +
 +	at91_bswap_buf(sc, dataptr + sc->xfer_offset, sc->bbuf_vaddr[curidx], len);
 +
 +	if (mci_debug) {
 +		printf("read done sr %x curidx %d len %d xfer_offset %d\n",
 +		       sr, curidx, len, sc->xfer_offset);
 +	}
 +
 +	sc->xfer_offset += len;
 +	sc->bbuf_curidx = !curidx; /* swap buffers */
 +
 +	/*
 +	 * If we've transferred all the data, move on to the next operation.
 +	 *
 +	 * If we're still transferring the last buffer, RNCR is already zero but
 +	 * we have to write a zero anyway to clear the ENDRX status so we don't
 +	 * re-interrupt until the last buffer is done.
 +	 */
 +	if (sc->xfer_offset == cmd->data->len) {
 +		WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
 +		cmd->error = MMC_ERR_NONE;
 +		at91_mci_next_operation(sc);
 +	} else {
 +		WR4(sc, PDC_RNCR, 0);
 +		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_ENDRX);
 +	}
  }
  
  static void
 -at91_mci_xmit_done(struct at91_mci_softc *sc)
 +at91_mci_write_done(struct at91_mci_softc *sc, uint32_t sr)
  {
 -	// Finish up the sequence...
 +	struct mmc_command *cmd = sc->curcmd;
 +
 +	/*
 +	 * We arrive here when the entire DMA transfer for a write is done,
 +	 * whether it's a single or multi-block write.  If it's multi-block we
 +	 * have to immediately move on to the next operation which is to send
 +	 * the stop command.  If it's a single-block transfer we need to wait
 +	 * for NOTBUSY, but if that's already asserted we can avoid another
 +	 * interrupt and just move on to completing the request right away.
 +	 */
 +
  	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
 -	WR4(sc, MCI_IDR, MCI_SR_TXBUFE);
 -	WR4(sc, MCI_IER, MCI_SR_NOTBUSY);
 -	bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_POSTWRITE);
 -	bus_dmamap_unload(sc->dmatag, sc->map);
 -	sc->mapped--;
 +
 +	bus_dmamap_sync(sc->dmatag, sc->bbuf_map[sc->bbuf_curidx],
 +	    BUS_DMASYNC_POSTWRITE);
 +	bus_dmamap_unload(sc->dmatag, sc->bbuf_map[sc->bbuf_curidx]);
 +
 +	if ((cmd->data->flags & MMC_DATA_MULTI) || (sr & MCI_SR_NOTBUSY)) {
 +		cmd->error = MMC_ERR_NONE;
 +		at91_mci_next_operation(sc);
 +	} else {
 +		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY);
 +	}
 +}
 +
 +static void
 +at91_mci_notbusy(struct at91_mci_softc *sc)
 +{
 +	struct mmc_command *cmd = sc->curcmd;	/* NOTE(review): no NULL check */
 +
 +	/*
 +	 * We arrive here by either completion of a single-block write, or
 +	 * completion of the stop command that ended a multi-block write (and,
 +	 * I suppose, after a card-select or erase, but I haven't tested
 +	 * those).  Anyway, we're done and it's time to move on to the next
 +	 * command.
 +	 */
 +
 +	cmd->error = MMC_ERR_NONE;	/* mark the current command successful */
 +	at91_mci_next_operation(sc);	/* start a pending cmd/stop or finish req */
 +}
 +
 +static void
 +at91_mci_stop_done(struct at91_mci_softc *sc, uint32_t sr)
 +{
 +	struct mmc_command *cmd = sc->curcmd;
 +
 +	/*
 +	 * We arrive here after receiving CMDRDY for a MMC_STOP_TRANSMISSION
 +	 * command.  Depending on the operation being stopped, we may have to
 +	 * do some unusual things to work around hardware bugs.
 +	 */
 +
 +	/*
 +	 * This is known to be true of at91rm9200 hardware; it may or may not
 +	 * apply to more recent chips:
 +	 *
 +	 * After stopping a multi-block write, the NOTBUSY bit in MCI_SR does
 +	 * not properly reflect the actual busy state of the card as signaled
 +	 * on the DAT0 line; it always claims the card is not-busy.  If we
 +	 * believe that and let operations continue, following commands will
 +	 * fail with response timeouts (except of course MMC_SEND_STATUS -- it
 +	 * indicates the card is busy in the PRG state, which was the smoking
 +	 * gun that showed MCI_SR NOTBUSY was not tracking DAT0 correctly).
 +	 *
 +	 * The atmel docs are emphatic: "This flag [NOTBUSY] must be used only
 +	 * for Write Operations."  I guess technically since we sent a stop
 +	 * it's not a write operation anymore.  But then just what did they
 +	 * think it meant for the stop command to have "...an optional busy
 +	 * signal transmitted on the data line" according to the SD spec?
 +	 *
 +	 * I tried a variety of things to un-wedge the MCI and get the status
 +	 * register to reflect NOTBUSY correctly again, but the only thing
 +	 * that worked was a full device reset.  It feels like an awfully big
 +	 * hammer, but doing a full reset after every multiblock write is
 +	 * still faster than doing single-block IO (by almost two orders of
 +	 * magnitude: 20KB/sec improves to about 1.8MB/sec best case).
 +	 *
 +	 * After doing the reset, wait for a NOTBUSY interrupt before
 +	 * continuing with the next operation.
 +	 */
 +	if (sc->flags & CMD_MULTIWRITE) {
 +		at91_mci_reset(sc);
 +		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY);
 +		return;		/* cmd->error is not set on this path */
 +	}
 +
 +	/*
 +	 * This is known to be true of at91rm9200 hardware; it may or may not
 +	 * apply to more recent chips:
 +	 *
 +	 * After stopping a multi-block read, loop to read and discard any
 +	 * data that coasts in after we sent the stop command.  The docs don't
 +	 * say anything about it, but empirical testing shows that 1-3
 +	 * additional words of data get buffered up in some unmentioned
 +	 * internal fifo and if we don't read and discard them here they end
 +	 * up on the front of the next read DMA transfer we do.
 +	 */
 +	if (sc->flags & CMD_MULTIREAD) {
 +		uint32_t sr;	/* NOTE(review): shadows the sr parameter */
 +		int count = 0;	/* only consumed by the disabled printf below */
 +
 +		do {
 +			sr = RD4(sc, MCI_SR);
 +			if (sr & MCI_SR_RXRDY) {
 +				RD4(sc,  MCI_RDR);	/* read and discard one word */
 +				++count;
 +			}
 +		} while (sr & MCI_SR_RXRDY);
 +		at91_mci_reset(sc);	/* reads get a full reset too, after draining */
 +//              if (count != 0)
 +//                      printf("Had to soak up %d words after read\n", count);
 +	}
 +
 +	cmd->error = MMC_ERR_NONE;	/* the STOP command itself succeeded */
 +	at91_mci_next_operation(sc);
 +
 +}
 +
 +static void
 +at91_mci_cmdrdy(struct at91_mci_softc *sc, uint32_t sr)
 +{
 +	struct mmc_command *cmd = sc->curcmd;
 +	int i;
 +
 +	if (cmd == NULL)
 +		return;
 +
 +	/*
 +	 * We get here at the end of EVERY command.  We retrieve the command
 +	 * response (if any) then decide what to do next based on the command.
 +	 *
 +	 * The response is read one 32-bit word at a time starting at MCI_RSPR:
 +	 * four words when MMC_RSP_136 is set in the command flags, otherwise
 +	 * a single word.  The sr argument is only used here for debug output.
 +	 */
 +
 +	if (cmd->flags & MMC_RSP_PRESENT) {
 +		for (i = 0; i < ((cmd->flags & MMC_RSP_136) ? 4 : 1); i++) {
 +			cmd->resp[i] = RD4(sc, MCI_RSPR + i * 4);
 +			if (mci_debug)
 +				printf("RSPR[%d] = %x sr=%x\n", i, cmd->resp[i],  sr);
 +		}
 +	}
 +
 +	/*
 +	 * If this was a stop command, go handle the various special
 +	 * conditions (read: bugs) that have to be dealt with following a stop.
 +	 * Everything else for a stop command is done by at91_mci_stop_done(),
 +	 * so we return immediately afterwards.
 +	 */
 +	if (cmd->opcode == MMC_STOP_TRANSMISSION) {
 +		at91_mci_stop_done(sc, sr);
 +		return;
 +	}
 +
 +	/*
 +	 * If this command can continue to assert BUSY beyond the response then
 +	 * we need to wait for NOTBUSY before the command is really done.
 +	 *
 +	 * Note that this may not work properly on the at91rm9200.  It certainly
 +	 * doesn't work for the STOP command that follows a multi-block write,
 +	 * so post-stop CMDRDY is handled separately; see the special handling
 +	 * in at91_mci_stop_done().
 +	 *
 +	 * Besides STOP, there are other R1B-type commands that use the busy
 +	 * signal after CMDRDY: CMD7 (card select), CMD28-29 (write protect),
 +	 * CMD38 (erase). I haven't tested any of them, but I rather expect
 +	 * them all to have the same sort of problem with MCI_SR not actually
 +	 * reflecting the state of the DAT0-line busy indicator.  So this code
 +	 * may need to grow some sort of special handling for them too. (This
 +	 * just in: CMD7 isn't a problem right now because dev/mmc.c incorrectly
 +	 * sets the response flags to R1 rather than R1B.) XXX
 +	 */
 +	if ((cmd->flags & MMC_RSP_BUSY)) {
 +		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY);
 +		return;
 +	}
 +
 +	/*
 +	 * If there is a data transfer with this command, then...
 +	 * - If it's a read, we need to wait for ENDRX.
 +	 * - If it's a write, now is the time to enable the PDC, and we need
 +	 *   to wait for a BLKE that follows a TXBUFE, because if we're doing
 +	 *   a split transfer we get a BLKE after the first half (when TPR/TCR
 +	 *   get loaded from TNPR/TNCR).  So first we wait for the TXBUFE, and
 +	 *   the handling for that interrupt will then invoke the wait for the
 +	 *   subsequent BLKE which indicates actual completion.
 +	 *
 +	 * In both cases the error interrupts (MCI_SR_ERROR) are enabled
 +	 * together with the completion interrupt we are waiting for.
 +	 */
 +	if (cmd->data) {
 +		uint32_t ier;
 +		if (cmd->data->flags & MMC_DATA_READ) {
 +			ier = MCI_SR_ENDRX;
 +		} else {
 +			ier = MCI_SR_TXBUFE;
 +			WR4(sc, PDC_PTCR, PDC_PTCR_TXTEN);
 +		}
 +		WR4(sc, MCI_IER, MCI_SR_ERROR | ier);
 +		return;
 +	}
 +
 +	/*
 +	 * If we made it to here, we don't need to wait for anything more for
 +	 * the current command, so mark it successful and move on to the next
 +	 * command (this will complete the request if there is no next
 +	 * command).
 +	 */
 +	cmd->error = MMC_ERR_NONE;
 +	at91_mci_next_operation(sc);
  }
  
  static void
  at91_mci_intr(void *arg)
  {
  	struct at91_mci_softc *sc = (struct at91_mci_softc*)arg;
 -	uint32_t sr;
 -	int i, done = 0;
 -	struct mmc_command *cmd;
 +	struct mmc_command *cmd = sc->curcmd;
 +	uint32_t sr, isr;
  
  	AT91_MCI_LOCK(sc);
 -	sr = RD4(sc, MCI_SR) & RD4(sc, MCI_IMR);
 -//	printf("i 0x%x\n", sr);
 -	cmd = sc->curcmd;
 -	if (sr & MCI_SR_ERROR) {
 -		// Ignore CRC errors on CMD2 and ACMD47, per relevant standards
 -		if ((sr & MCI_SR_RCRCE) && (cmd->opcode == MMC_SEND_OP_COND ||
 -		    cmd->opcode == ACMD_SD_SEND_OP_COND))
 -			cmd->error = MMC_ERR_NONE;
 -		else if (sr & (MCI_SR_RTOE | MCI_SR_DTOE))
 +
 +	sr = RD4(sc, MCI_SR);
 +	isr = sr & RD4(sc, MCI_IMR);
 +
 +	if (mci_debug)
 +		printf("i 0x%x sr 0x%x\n", isr, sr);
 +
 +	/*
 +	 * All interrupts are one-shot; disable the ones that just fired.
 +	 * The next operation will re-enable whatever interrupts it wants.
 +	 */
 +	WR4(sc, MCI_IDR, isr);
 +	if (isr & MCI_SR_ERROR) {
 +		if (isr & (MCI_SR_RTOE | MCI_SR_DTOE))
  			cmd->error = MMC_ERR_TIMEOUT;
 -		else if (sr & (MCI_SR_RCRCE | MCI_SR_DCRCE))
 +		else if (isr & (MCI_SR_RCRCE | MCI_SR_DCRCE))
  			cmd->error = MMC_ERR_BADCRC;
 -		else if (sr & (MCI_SR_OVRE | MCI_SR_UNRE))
 +		else if (isr & (MCI_SR_OVRE | MCI_SR_UNRE))
  			cmd->error = MMC_ERR_FIFO;
  		else
  			cmd->error = MMC_ERR_FAILED;
 -		done = 1;
 -		if (sc->mapped && cmd->error) {
 -			bus_dmamap_unload(sc->dmatag, sc->map);
 -			sc->mapped--;
 +		/*
 +		 * CMD8 is used to probe for SDHC cards, a standard SD card
 +		 * will get a response timeout; don't report it because it's a
 +		 * normal and expected condition.  One might argue that all
 +		 * error reporting should be left to higher levels, but when
 +		 * they report at all it's always EIO, which isn't very
 +		 * helpful. XXX bootverbose?
 +		 */
 +		if (cmd->opcode != 8) {
 +			device_printf(sc->dev, 
 
 *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
 _______________________________________________
 svn-src-all@freebsd.org mailing list
 http://lists.freebsd.org/mailman/listinfo/svn-src-all
 To unsubscribe, send any mail to "svn-src-all-unsubscribe@freebsd.org"