Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 28 Aug 2012 01:28:53 +0000 (UTC)
From:      Warner Losh <imp@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r239762 - head/sys/arm/at91
Message-ID:  <201208280128.q7S1SriX088038@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: imp
Date: Tue Aug 28 01:28:52 2012
New Revision: 239762
URL: http://svn.freebsd.org/changeset/base/239762

Log:
  Bring in the multi-block patches for mci.  These required extensive
  restructuring of the driver.  I've tried to preserve the other silicon
  workarounds that we've added over the years, but haven't had a chance
  to extensively test on other hardware.  On my AT91RM9200 with 30MHz/1
  wire/64 block transfers, I've been able to go from ~.66MB/s to
  2.25MB/s in the simple tests I performed, almost a 3.5x improvement.
  This cuts the boot time almost in half when everything else goes
  right (timed from rtc message to login: prompt).
  
  PR:		155214
  Submitted by:	Ian Lepore

Modified:
  head/sys/arm/at91/at91_mci.c

Modified: head/sys/arm/at91/at91_mci.c
==============================================================================
--- head/sys/arm/at91/at91_mci.c	Mon Aug 27 23:27:41 2012	(r239761)
+++ head/sys/arm/at91/at91_mci.c	Tue Aug 28 01:28:52 2012	(r239762)
@@ -114,7 +114,24 @@ __FBSDID("$FreeBSD$");
 #define AT91_MCI_USE_30MHZ 1
 #endif
 
-#define BBSZ	512
+/*
+ * Allocate 2 bounce buffers we'll use to endian-swap the data due to the rm9200
+ * erratum.  We use a pair of buffers because when reading that lets us begin
+ * endian-swapping the data in the first buffer while the DMA is reading into
+ * the second buffer.  (We can't use the same trick for writing because we might
+ * not get all the data in the 2nd buffer swapped before the hardware needs it;
+ * dealing with that would add complexity to the driver.)
+ *
+ * The buffers are sized at 16K each due to the way the busdma cache sync
+ * operations work on arm.  A dcache_inv_range() operation on a range larger
+ * than 16K gets turned into a dcache_wbinv_all().  That needlessly flushes the
+ * entire data cache, impacting overall system performance.
+ */
+#define BBCOUNT     2
+#define BBSIZE      (16*1024)
+#define MAX_BLOCKS  ((BBSIZE*BBCOUNT)/512)
+
+static int mci_debug;
 
 struct at91_mci_softc {
 	void *intrhand;			/* Interrupt handle */
@@ -123,21 +140,25 @@ struct at91_mci_softc {
 #define	CAP_HAS_4WIRE		1	/* Has 4 wire bus */
 #define	CAP_NEEDS_BYTESWAP	2	/* broken hardware needing bounce */
 	int flags;
-#define CMD_STARTED	1
-#define STOP_STARTED	2
+#define PENDING_CMD	0x01
+#define PENDING_STOP	0x02
+#define CMD_MULTIREAD	0x10
+#define CMD_MULTIWRITE	0x20
 	int has_4wire;
 	int use_30mhz;
 	struct resource *irq_res;	/* IRQ resource */
 	struct resource	*mem_res;	/* Memory resource */
 	struct mtx sc_mtx;
 	bus_dma_tag_t dmatag;
-	bus_dmamap_t map;
-	int mapped;
 	struct mmc_host host;
 	int bus_busy;
 	struct mmc_request *req;
 	struct mmc_command *curcmd;
-	char bounce_buffer[BBSZ];
+	bus_dmamap_t bbuf_map[BBCOUNT];
+	char      *  bbuf_vaddr[BBCOUNT]; /* bounce bufs in KVA space */
+	uint32_t     bbuf_len[BBCOUNT];	  /* len currently queued for bounce buf */
+	uint32_t     bbuf_curidx;	  /* which bbuf is the active DMA buffer */
+	uint32_t     xfer_offset;	  /* offset so far into caller's buf */
 };
 
 static inline uint32_t
@@ -172,6 +193,51 @@ static int at91_mci_is_mci1rev2xx(void);
 #define AT91_MCI_ASSERT_LOCKED(_sc)	mtx_assert(&_sc->sc_mtx, MA_OWNED);
 #define AT91_MCI_ASSERT_UNLOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_NOTOWNED);
 
+/*
+ * Copy memsize bytes from sptr to dptr, endian-swapping each 32-bit word when
+ * sc->sc_cap has CAP_NEEDS_BYTESWAP set; otherwise do a plain copy.
+ */
+static void
+at91_bswap_buf(struct at91_mci_softc *sc, void * dptr, void * sptr, uint32_t memsize)
+{
+	uint32_t * dst = (uint32_t *)dptr;
+	uint32_t * src = (uint32_t *)sptr;
+	uint32_t   i;
+
+	/*
+	 * If the hardware doesn't need byte-swapping, let bcopy() do the work;
+	 * note that bcopy() takes (src, dst, len).  The bounce buffer is used
+	 * even then, since the caller's buffer may straddle a page boundary
+	 * and we don't handle multi-segment transfers in hardware.  Seen from
+	 * 'bsdlabel -w' which uses raw geom access to the volume.  Greg Ansley
+	 * (gja (at) ansley.com)
+	 */
+	if (!(sc->sc_cap & CAP_NEEDS_BYTESWAP)) {
+		bcopy(sptr, dptr, memsize);
+		return;
+	}
+
+	/* Unrolling by 4 is a nice boost (further unrolling gains little). */
+	for (i = 0; i < (memsize & ~(uint32_t)0x0F); i += 16) {
+		*dst++ = bswap32(*src++);
+		*dst++ = bswap32(*src++);
+		*dst++ = bswap32(*src++);
+		*dst++ = bswap32(*src++);
+	}
+
+	/* Mop up the last 1-3 words; memsize is assumed a multiple of 4. */
+	for (i = 0; i < (memsize & 0x0F); i += 4)
+		*dst++ = bswap32(*src++);
+}
+
+static void
+at91_mci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
+{
+	if (error != 0)
+		return;
+	*(bus_addr_t *)arg = segs[0].ds_addr;
+}
+
 static void
 at91_mci_pdc_disable(struct at91_mci_softc *sc)
 {
@@ -186,13 +252,57 @@ at91_mci_pdc_disable(struct at91_mci_sof
 	WR4(sc, PDC_TNCR, 0);
 }
 
+/*
+ * Reset the controller, then restore most of the current state.
+ *
+ * This is called after detecting an error.  It's also called after stopping a
+ * multi-block write, to un-wedge the device so that it will handle the NOTBUSY
+ * signal correctly.  See comments in at91_mci_stop_done() for more details.
+ */
+static void at91_mci_reset(struct at91_mci_softc *sc)
+{
+	uint32_t mr;
+	uint32_t sdcr;
+	uint32_t dtor;
+	uint32_t imr;
+
+	at91_mci_pdc_disable(sc);
+
+	/* save current state */
+
+	imr  = RD4(sc, MCI_IMR);
+	mr   = RD4(sc, MCI_MR) & 0x7fff;
+	sdcr = RD4(sc, MCI_SDCR);
+	dtor = RD4(sc, MCI_DTOR);
+
+	/* reset the controller */
+
+	WR4(sc, MCI_IDR, 0xffffffff);
+	WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST);
+
+	/* restore state */
+
+	WR4(sc, MCI_CR, MCI_CR_MCIEN|MCI_CR_PWSEN);
+	WR4(sc, MCI_MR, mr);
+	WR4(sc, MCI_SDCR, sdcr);
+	WR4(sc, MCI_DTOR, dtor);
+	WR4(sc, MCI_IER, imr);
+
+	/*
+	 * Make sure sdio interrupts will fire.  Not sure why reading
+	 * SR ensures that, but this is in the linux driver.
+	 */
+
+	RD4(sc, MCI_SR);
+}
+
 static void
 at91_mci_init(device_t dev)
 {
 	struct at91_mci_softc *sc = device_get_softc(dev);
 	uint32_t val;
 
-	WR4(sc, MCI_CR, MCI_CR_MCIEN);		/* Enable controller */
+	WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST); /* device into reset */
 	WR4(sc, MCI_IDR, 0xffffffff);		/* Turn off interrupts */
 	WR4(sc, MCI_DTOR, MCI_DTOR_DTOMUL_1M | 1);
 	val = MCI_MR_PDCMODE;
@@ -203,10 +313,19 @@ at91_mci_init(device_t dev)
 #ifndef  AT91_MCI_SLOT_B
 	WR4(sc, MCI_SDCR, 0);			/* SLOT A, 1 bit bus */
 #else
-	/* XXX Really should add second "unit" but nobody using using
-	 * a two slot card that we know of. -- except they are... XXX */
+	/*
+	 * XXX Really should add second "unit" but nobody is using
+	 * a two slot card that we know of. XXX
+	 */
 	WR4(sc, MCI_SDCR, 1);			/* SLOT B, 1 bit bus */
 #endif
+	/*
+	 * Enable controller, including power-save.  The slower clock
+	 * of the power-save mode is only in effect when there is no
+	 * transfer in progress, so it can be left in this mode all
+	 * the time.
+	 */
+	WR4(sc, MCI_CR, MCI_CR_MCIEN|MCI_CR_PWSEN);
 }
 
 static void
@@ -216,7 +335,7 @@ at91_mci_fini(device_t dev)
 
 	WR4(sc, MCI_IDR, 0xffffffff);		/* Turn off interrupts */
 	at91_mci_pdc_disable(sc);
-	WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST); /* Put the device into reset */
+	WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST); /* device into reset */
 }
 
 static int
@@ -234,7 +353,7 @@ at91_mci_attach(device_t dev)
 	struct sysctl_ctx_list *sctx;
 	struct sysctl_oid *soid;
 	device_t child;
-	int err;
+	int err, i;
 
 	sctx = device_get_sysctl_ctx(dev);
 	soid = device_get_sysctl_tree(dev);
@@ -249,21 +368,33 @@ at91_mci_attach(device_t dev)
 
 	AT91_MCI_LOCK_INIT(sc);
 
+	at91_mci_fini(dev);
+	at91_mci_init(dev);
+
 	/*
-	 * Allocate DMA tags and maps
+	 * Allocate DMA tags and maps and bounce buffers.
+	 *
+	 * The parms in the tag_create call cause the dmamem_alloc call to
+	 * create each bounce buffer as a single contiguous buffer of BBSIZE
+	 * bytes aligned to a 4096 byte boundary.
+	 *
+	 * Do not use DMA_COHERENT for these buffers because that maps the
+	 * memory as non-cachable, which prevents cache line burst fills/writes,
+	 * which is something we need since we're trying to overlap the
+	 * byte-swapping with the DMA operations.
 	 */
-	err = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0,
-	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, MAXPHYS, 1,
-	    MAXPHYS, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->dmatag);
+	err = bus_dma_tag_create(bus_get_dma_tag(dev), 4096, 0,
+	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, 
+	    BBSIZE, 1, BBSIZE, 0, NULL, NULL, &sc->dmatag);
 	if (err != 0)
 		goto out;
 
-	err = bus_dmamap_create(sc->dmatag, 0,  &sc->map);
-	if (err != 0)
-		goto out;
-
-	at91_mci_fini(dev);
-	at91_mci_init(dev);
+	for (i = 0; i < BBCOUNT; ++i) {
+		err = bus_dmamem_alloc(sc->dmatag, (void **)&sc->bbuf_vaddr[i],
+		    BUS_DMA_NOWAIT, &sc->bbuf_map[i]);
+		if (err != 0)
+			goto out;
+	}
 
 	/*
 	 * Activate the interrupt
@@ -330,8 +461,15 @@ out:
 static int
 at91_mci_detach(device_t dev)
 {
+	struct at91_mci_softc *sc = device_get_softc(dev);
+
 	at91_mci_fini(dev);
 	at91_mci_deactivate(dev);
+
+	bus_dmamem_free(sc->dmatag, sc->bbuf_vaddr[0], sc->bbuf_map[0]);
+	bus_dmamem_free(sc->dmatag, sc->bbuf_vaddr[1], sc->bbuf_map[1]);
+	bus_dma_tag_destroy(sc->dmatag);
+
 	return (EBUSY);	/* XXX */
 }
 
@@ -398,14 +536,6 @@ at91_mci_is_mci1rev2xx(void)
 	}
 }
 
-static void
-at91_mci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
-{
-	if (error != 0)
-		return;
-	*(bus_addr_t *)arg = segs[0].ds_addr;
-}
-
 static int
 at91_mci_update_ios(device_t brdev, device_t reqdev)
 {
@@ -437,7 +567,7 @@ at91_mci_update_ios(device_t brdev, devi
 		if (sc->use_30mhz && ios->clock == 25000000 &&
 		    at91_master_clock > 50000000)
 			clkdiv = 0;
-                else if ((at91_master_clock % (ios->clock * 2)) == 0)
+		else if ((at91_master_clock % (ios->clock * 2)) == 0)
 			clkdiv = ((at91_master_clock / ios->clock) / 2) - 1;
 		else
 			clkdiv = (at91_master_clock / ios->clock) / 2;
@@ -456,73 +586,182 @@ at91_mci_update_ios(device_t brdev, devi
 static void
 at91_mci_start_cmd(struct at91_mci_softc *sc, struct mmc_command *cmd)
 {
-	size_t len;
-	uint32_t cmdr, ier = 0, mr;
-	uint32_t *src, *dst;
-	int i;
+	uint32_t cmdr, mr;
 	struct mmc_data *data;
-	void *vaddr;
-	bus_addr_t paddr;
 
 	sc->curcmd = cmd;
 	data = cmd->data;
-	cmdr = cmd->opcode;
 
 	/* XXX Upper layers don't always set this */
 	cmd->mrq = sc->req;
 
+	/* Begin setting up command register. */
+
+	cmdr = cmd->opcode;
+
+	if (sc->host.ios.bus_mode == opendrain)
+		cmdr |= MCI_CMDR_OPDCMD;
+
+	/* Set up response handling.  Allow max timeout for responses. */
+
 	if (MMC_RSP(cmd->flags) == MMC_RSP_NONE)
 		cmdr |= MCI_CMDR_RSPTYP_NO;
 	else {
-		/* Allow big timeout for responses */
 		cmdr |= MCI_CMDR_MAXLAT;
 		if (cmd->flags & MMC_RSP_136)
 			cmdr |= MCI_CMDR_RSPTYP_136;
 		else
 			cmdr |= MCI_CMDR_RSPTYP_48;
 	}
-	if (cmd->opcode == MMC_STOP_TRANSMISSION)
-		cmdr |= MCI_CMDR_TRCMD_STOP;
-	if (sc->host.ios.bus_mode == opendrain)
-		cmdr |= MCI_CMDR_OPDCMD;
-	if (!data) {
-		// The no data case is fairly simple
+
+	/*
+	 * If there is no data transfer, just set up the right interrupt mask
+	 * and start the command.
+	 *
+	 * The interrupt mask needs to be CMDRDY plus all non-data-transfer
+	 * errors. It's important to leave the transfer-related errors out, to
+	 * avoid spurious timeout or crc errors on a STOP command following a
+	 * multiblock read.  When a multiblock read is in progress, sending a
+	 * STOP in the middle of a block occasionally triggers such errors, but
+	 * we're totally disinterested in them because we've already gotten all
+	 * the data we wanted without error before sending the STOP command.
+	 */
+
+	if (data == NULL) {
+		uint32_t ier = MCI_SR_CMDRDY | 
+		    MCI_SR_RTOE | MCI_SR_RENDE | 
+		    MCI_SR_RCRCE | MCI_SR_RDIRE | MCI_SR_RINDE;
+
 		at91_mci_pdc_disable(sc);
-//		printf("CMDR %x ARGR %x\n", cmdr, cmd->arg);
+
+		if (cmd->opcode == MMC_STOP_TRANSMISSION)
+			cmdr |= MCI_CMDR_TRCMD_STOP;
+
+		/* Ignore response CRC on CMD1 and ACMD41, per standard. */
+
+		if (cmd->opcode == MMC_SEND_OP_COND ||
+		    cmd->opcode == ACMD_SD_SEND_OP_COND)
+			ier &= ~MCI_SR_RCRCE;
+
+		if (mci_debug)
+			printf("CMDR %x (opcode %d) ARGR %x no data\n", 
+			    cmdr, cmd->opcode, cmd->arg);
+
 		WR4(sc, MCI_ARGR, cmd->arg);
 		WR4(sc, MCI_CMDR, cmdr);
-		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_CMDRDY);
+		WR4(sc, MCI_IDR, 0xffffffff);
+		WR4(sc, MCI_IER, ier);
 		return;
 	}
+
+	/* There is data, set up the transfer-related parts of the command. */
+
 	if (data->flags & MMC_DATA_READ)
 		cmdr |= MCI_CMDR_TRDIR;
+
 	if (data->flags & (MMC_DATA_READ | MMC_DATA_WRITE))
 		cmdr |= MCI_CMDR_TRCMD_START;
+
 	if (data->flags & MMC_DATA_STREAM)
 		cmdr |= MCI_CMDR_TRTYP_STREAM;
-	if (data->flags & MMC_DATA_MULTI)
+	else if (data->flags & MMC_DATA_MULTI) {
 		cmdr |= MCI_CMDR_TRTYP_MULTIPLE;
-	// Set block size and turn on PDC mode for dma xfer and disable
-	// PDC until we're ready.
-	mr = RD4(sc, MCI_MR) & ~MCI_MR_BLKLEN;
-	WR4(sc, MCI_MR, mr | (data->len << 16) | MCI_MR_PDCMODE);
-	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
-	if (cmdr & MCI_CMDR_TRCMD_START) {
-		len = data->len;
-		if (cmdr & MCI_CMDR_TRDIR)
-			vaddr = cmd->data->data;
-		else {
-			/* Use bounce buffer even if we don't need
-			 * byteswap, since buffer may straddle a page
-			 * boundry, and we don't handle multi-segment
-			 * transfers in hardware.
-			 * (page issues seen from 'bsdlabel -w' which
-			 * uses raw geom access to the volume).
-			 * Greg Ansley (gja (at) ansley.com)
-			 */
-			vaddr = sc->bounce_buffer;
-			src = (uint32_t *)cmd->data->data;
-			dst = (uint32_t *)vaddr;
+		sc->flags |= (data->flags & MMC_DATA_READ) ? 
+				CMD_MULTIREAD : CMD_MULTIWRITE;
+	}
+
+	/*
+	 * Disable PDC until we're ready.
+	 *
+	 * Set block size and turn on PDC mode for dma xfer.
+	 * Note that the block size is the smaller of the amount of data to be
+	 * transferred, or 512 bytes.  The 512 size is fixed by the standard;
+	 * smaller blocks are possible, but never larger.
+	 */
+
+	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS); 
+
+	mr = RD4(sc,MCI_MR) & ~MCI_MR_BLKLEN; 
+	mr |=  min(data->len, 512) << 16; 
+	WR4(sc, MCI_MR, mr | MCI_MR_PDCMODE|MCI_MR_PDCPADV);
+
+	/*
+	 * Set up DMA.
+	 *
+	 * Use bounce buffers even if we don't need to byteswap, because doing
+	 * multi-block IO with large DMA buffers is way fast (compared to
+	 * single-block IO), even after incurring the overhead of also copying
+	 * from/to the caller's buffers (which may be in non-contiguous physical
+	 * pages).
+	 *
+	 * In an ideal non-byteswap world we could create a dma tag that allows
+	 * for discontiguous segments and do the IO directly from/to the
+	 * caller's buffer(s), using ENDRX/ENDTX interrupts to chain the
+	 * discontiguous buffers through the PDC. Someday.
+	 *
+	 * If a read is bigger than 2k, split it in half so that we can start
+	 * byte-swapping the first half while the second half is on the wire.
+	 * It would be best if we could split it into 8k chunks, but we can't
+	 * always keep up with the byte-swapping due to other system activity,
+	 * and if an RXBUFF interrupt happens while we're still handling the
+	 * byte-swap from the prior buffer (IE, we haven't returned from
+	 * handling the prior interrupt yet), then data will get dropped on the
+	 * floor and we can't easily recover from that.  The right fix for that
+	 * would be to have the interrupt handling only keep the DMA flowing and
+	 * enqueue filled buffers to be byte-swapped in a non-interrupt context.
+	 * Even that won't work on the write side of things though; in that
+	 * context we have to have all the data ready to go before starting the
+	 * dma.
+	 *
+	 * XXX what about stream transfers?
+	 */
+	sc->xfer_offset = 0;
+	sc->bbuf_curidx = 0;
+
+	if (data->flags & (MMC_DATA_READ | MMC_DATA_WRITE)) {
+		uint32_t len;
+		uint32_t remaining = data->len;
+		bus_addr_t paddr;
+		int err;
+
+		if (remaining > (BBCOUNT*BBSIZE))
+			panic("IO read size exceeds MAXDATA\n");
+
+		if (data->flags & MMC_DATA_READ) {
+			if (remaining > 2048) // XXX
+				len = remaining / 2;
+			else
+				len = remaining;
+			err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[0], 
+			    sc->bbuf_vaddr[0], len, at91_mci_getaddr, 
+			    &paddr, BUS_DMA_NOWAIT);
+			if (err != 0)
+				panic("IO read dmamap_load failed\n");
+			bus_dmamap_sync(sc->dmatag, sc->bbuf_map[0], 
+			    BUS_DMASYNC_PREREAD);
+			WR4(sc, PDC_RPR, paddr);
+			WR4(sc, PDC_RCR, len / 4);
+			sc->bbuf_len[0] = len;
+			remaining -= len;
+			if (remaining == 0) {
+				sc->bbuf_len[1] = 0;
+			} else {
+				len = remaining;
+				err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[1], 
+				    sc->bbuf_vaddr[1], len, at91_mci_getaddr, 
+				    &paddr, BUS_DMA_NOWAIT);
+				if (err != 0)
+					panic("IO read dmamap_load failed\n");
+				bus_dmamap_sync(sc->dmatag, sc->bbuf_map[1], 
+				    BUS_DMASYNC_PREREAD);
+				WR4(sc, PDC_RNPR, paddr);
+				WR4(sc, PDC_RNCR, len / 4);
+				sc->bbuf_len[1] = len;
+				remaining -= len;
+			}
+			WR4(sc, PDC_PTCR, PDC_PTCR_RXTEN);
+		} else {
+			len = min(BBSIZE, remaining);
 			/*
 			 * If this is MCI1 revision 2xx controller, apply
 			 * a work-around for the "Data Write Operation and
@@ -530,74 +769,75 @@ at91_mci_start_cmd(struct at91_mci_softc
 			 */
 			if (at91_mci_is_mci1rev2xx() && data->len < 12) {
 				len = 12;
-				memset(dst, 0, 12);
+				memset(data->data, 0, 12);
 			}
-			if (sc->sc_cap & CAP_NEEDS_BYTESWAP) {
-				for (i = 0; i < data->len / 4; i++)
-					dst[i] = bswap32(src[i]);
-			} else
-				memcpy(dst, src, data->len);
-		}
-		data->xfer_len = 0;
-		if (bus_dmamap_load(sc->dmatag, sc->map, vaddr, len,
-		    at91_mci_getaddr, &paddr, 0) != 0) {
-			cmd->error = MMC_ERR_NO_MEMORY;
-			sc->req = NULL;
-			sc->curcmd = NULL;
-			cmd->mrq->done(cmd->mrq);
-			return;
-		}
-		sc->mapped++;
-		if (cmdr & MCI_CMDR_TRDIR) {
-			bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_PREREAD);
-			WR4(sc, PDC_RPR, paddr);
-			WR4(sc, PDC_RCR, len / 4);
-			ier = MCI_SR_ENDRX;
-		} else {
-			bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_PREWRITE);
-			WR4(sc, PDC_TPR, paddr);
+			at91_bswap_buf(sc, sc->bbuf_vaddr[0], data->data, len);
+			err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[0], 
+			    sc->bbuf_vaddr[0], len, at91_mci_getaddr, 
+			    &paddr, BUS_DMA_NOWAIT);
+			if (err != 0)
+				panic("IO write dmamap_load failed\n");
+			bus_dmamap_sync(sc->dmatag, sc->bbuf_map[0], 
+			    BUS_DMASYNC_PREWRITE);
+			WR4(sc, PDC_TPR,paddr);
 			WR4(sc, PDC_TCR, len / 4);
-			ier = MCI_SR_TXBUFE;
+			sc->bbuf_len[0] = len;
+			remaining -= len;
+			if (remaining == 0) {
+				sc->bbuf_len[1] = 0;
+			} else {
+				len = remaining;
+				at91_bswap_buf(sc, sc->bbuf_vaddr[1],
+				    ((char *)data->data)+BBSIZE, len);
+				err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[1], 
+				    sc->bbuf_vaddr[1], len, at91_mci_getaddr, 
+				    &paddr, BUS_DMA_NOWAIT);
+				if (err != 0)
+					panic("IO write dmamap_load failed\n");
+				bus_dmamap_sync(sc->dmatag, sc->bbuf_map[1], 
+				    BUS_DMASYNC_PREWRITE);
+				WR4(sc, PDC_TNPR, paddr);
+				WR4(sc, PDC_TNCR, len / 4);
+				sc->bbuf_len[1] = len;
+				remaining -= len;
+			}
+			/* do not enable PDC xfer until CMDRDY asserted */
 		}
+		data->xfer_len = 0; /* XXX what's this? appears to be unused. */
 	}
-//	printf("CMDR %x ARGR %x with data\n", cmdr, cmd->arg);
+
+	if (mci_debug)
+		printf("CMDR %x (opcode %d) ARGR %x with data len %d\n", 
+		       cmdr, cmd->opcode, cmd->arg, cmd->data->len);
+
 	WR4(sc, MCI_ARGR, cmd->arg);
-	if (cmdr & MCI_CMDR_TRCMD_START) {
-		if (cmdr & MCI_CMDR_TRDIR) {
-			WR4(sc, PDC_PTCR, PDC_PTCR_RXTEN);
-			WR4(sc, MCI_CMDR, cmdr);
-		} else {
-			WR4(sc, MCI_CMDR, cmdr);
-			WR4(sc, PDC_PTCR, PDC_PTCR_TXTEN);
-		}
-	}
-	WR4(sc, MCI_IER, MCI_SR_ERROR | ier);
+	WR4(sc, MCI_CMDR, cmdr);
+	WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_CMDRDY);
 }
 
 static void
-at91_mci_start(struct at91_mci_softc *sc)
+at91_mci_next_operation(struct at91_mci_softc *sc)
 {
 	struct mmc_request *req;
 
 	req = sc->req;
 	if (req == NULL)
 		return;
-	// assert locked
-	if (!(sc->flags & CMD_STARTED)) {
-		sc->flags |= CMD_STARTED;
-//		printf("Starting CMD\n");
+
+	if (sc->flags & PENDING_CMD) {
+		sc->flags &= ~PENDING_CMD;
 		at91_mci_start_cmd(sc, req->cmd);
 		return;
-	}
-	if (!(sc->flags & STOP_STARTED) && req->stop) {
-//		printf("Starting Stop\n");
-		sc->flags |= STOP_STARTED;
+	} else if (sc->flags & PENDING_STOP) {
+		sc->flags &= ~PENDING_STOP;
 		at91_mci_start_cmd(sc, req->stop);
 		return;
 	}
-	/* We must be done -- bad idea to do this while locked? */
+
+	WR4(sc, MCI_IDR, 0xffffffff);
 	sc->req = NULL;
 	sc->curcmd = NULL;
+	//printf("req done\n");
 	req->done(req);
 }
 
@@ -607,16 +847,16 @@ at91_mci_request(device_t brdev, device_
 	struct at91_mci_softc *sc = device_get_softc(brdev);
 
 	AT91_MCI_LOCK(sc);
-	// XXX do we want to be able to queue up multiple commands?
-	// XXX sounds like a good idea, but all protocols are sync, so
-	// XXX maybe the idea is naive...
 	if (sc->req != NULL) {
 		AT91_MCI_UNLOCK(sc);
 		return (EBUSY);
 	}
+	//printf("new req\n");
 	sc->req = req;
-	sc->flags = 0;
-	at91_mci_start(sc);
+	sc->flags = PENDING_CMD;
+	if (sc->req->stop)
+		sc->flags |= PENDING_STOP;
+	at91_mci_next_operation(sc);
 	AT91_MCI_UNLOCK(sc);
 	return (0);
 }
@@ -654,120 +894,351 @@ at91_mci_release_host(device_t brdev, de
 }
 
 static void
-at91_mci_read_done(struct at91_mci_softc *sc)
+at91_mci_read_done(struct at91_mci_softc *sc, uint32_t sr)
 {
-	uint32_t *walker;
-	struct mmc_command *cmd;
-	int i, len;
-
-	cmd = sc->curcmd;
-	bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_POSTREAD);
-	bus_dmamap_unload(sc->dmatag, sc->map);
-	sc->mapped--;
-	if (sc->sc_cap & CAP_NEEDS_BYTESWAP) {
-		walker = (uint32_t *)cmd->data->data;
-		len = cmd->data->len / 4;
-		for (i = 0; i < len; i++)
-			walker[i] = bswap32(walker[i]);
-	}
-	// Finish up the sequence...
-	WR4(sc, MCI_IDR, MCI_SR_ENDRX);
-	WR4(sc, MCI_IER, MCI_SR_RXBUFF);
-	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
+	struct mmc_command *cmd = sc->curcmd;
+	char * dataptr = (char *)cmd->data->data;
+	uint32_t curidx = sc->bbuf_curidx;
+	uint32_t len = sc->bbuf_len[curidx];
+
+	/*
+	 * We arrive here when a DMA transfer for a read is done, whether it's
+	 * a single or multi-block read.
+	 *
+	 * We byte-swap the buffer that just completed, and if that is the
+	 * last buffer that's part of this read then we move on to the next
+	 * operation, otherwise we wait for another ENDRX for the next buffer.
+	 */
+
+	bus_dmamap_sync(sc->dmatag, sc->bbuf_map[curidx], BUS_DMASYNC_POSTREAD);
+	bus_dmamap_unload(sc->dmatag, sc->bbuf_map[curidx]);
+
+	at91_bswap_buf(sc, dataptr + sc->xfer_offset, sc->bbuf_vaddr[curidx], len);
+
+	if (mci_debug) {
+		printf("read done sr %x curidx %d len %d xfer_offset %d\n",
+		       sr, curidx, len, sc->xfer_offset);
+	}
+
+	sc->xfer_offset += len;
+	sc->bbuf_curidx = !curidx; /* swap buffers */
+
+	/*
+	 * If we've transferred all the data, move on to the next operation.
+	 *
+	 * If we're still transferring the last buffer, RNCR is already zero but
+	 * we have to write a zero anyway to clear the ENDRX status so we don't
+	 * re-interrupt until the last buffer is done.
+	 */
+	if (sc->xfer_offset == cmd->data->len) {
+		WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
+		cmd->error = MMC_ERR_NONE;
+		at91_mci_next_operation(sc);
+	} else {
+		WR4(sc, PDC_RNCR, 0);
+		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_ENDRX);
+	}
 }
 
 static void
-at91_mci_xmit_done(struct at91_mci_softc *sc)
+at91_mci_write_done(struct at91_mci_softc *sc, uint32_t sr)
 {
-	// Finish up the sequence...
+	struct mmc_command *cmd = sc->curcmd;
+
+	/*
+	 * We arrive here when the entire DMA transfer for a write is done,
+	 * whether it's a single or multi-block write.  If it's multi-block we
+	 * have to immediately move on to the next operation which is to send
+	 * the stop command.  If it's a single-block transfer we need to wait
+	 * for NOTBUSY, but if that's already asserted we can avoid another
+	 * interrupt and just move on to completing the request right away.
+	 */
+
 	WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS);
-	WR4(sc, MCI_IDR, MCI_SR_TXBUFE);
-	WR4(sc, MCI_IER, MCI_SR_NOTBUSY);
-	bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_POSTWRITE);
-	bus_dmamap_unload(sc->dmatag, sc->map);
-	sc->mapped--;
+
+	bus_dmamap_sync(sc->dmatag, sc->bbuf_map[sc->bbuf_curidx],
+	    BUS_DMASYNC_POSTWRITE);
+	bus_dmamap_unload(sc->dmatag, sc->bbuf_map[sc->bbuf_curidx]);
+
+	if ((cmd->data->flags & MMC_DATA_MULTI) || (sr & MCI_SR_NOTBUSY)) {
+		cmd->error = MMC_ERR_NONE;
+		at91_mci_next_operation(sc);
+	} else {
+		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY);
+	}
+}
+
+static void
+at91_mci_notbusy(struct at91_mci_softc *sc)
+{
+	struct mmc_command *cmd = sc->curcmd;
+
+	/*
+	 * We arrive here by either completion of a single-block write, or
+	 * completion of the stop command that ended a multi-block write (and,
+	 * I suppose, after a card-select or erase, but I haven't tested
+	 * those).  Anyway, we're done and it's time to move on to the next
+	 * command.
+	 */
+
+	cmd->error = MMC_ERR_NONE;
+	at91_mci_next_operation(sc);
+}
+
+static void
+at91_mci_stop_done(struct at91_mci_softc *sc, uint32_t sr)
+{
+	struct mmc_command *cmd = sc->curcmd;
+
+	/*
+	 * We arrive here after receiving CMDRDY for a MMC_STOP_TRANSMISSION
+	 * command.  Depending on the operation being stopped, we may have to
+	 * do some unusual things to work around hardware bugs.
+	 */
+
+	/*
+	 * This is known to be true of at91rm9200 hardware; it may or may not
+	 * apply to more recent chips: 
+	 *
+	 * After stopping a multi-block write, the NOTBUSY bit in MCI_SR does
+	 * not properly reflect the actual busy state of the card as signaled
+	 * on the DAT0 line; it always claims the card is not-busy.  If we
+	 * believe that and let operations continue, following commands will
+	 * fail with response timeouts (except of course MMC_SEND_STATUS -- it
+	 * indicates the card is busy in the PRG state, which was the smoking
+	 * gun that showed MCI_SR NOTBUSY was not tracking DAT0 correctly).
+	 *
+	 * The atmel docs are emphatic: "This flag [NOTBUSY] must be used only
+	 * for Write Operations."  I guess technically since we sent a stop
+	 * it's not a write operation anymore.  But then just what did they
+	 * think it meant for the stop command to have "...an optional busy
+	 * signal transmitted on the data line" according to the SD spec?
+	 *
+	 * I tried a variety of things to un-wedge the MCI and get the status
+	 * register to reflect NOTBUSY correctly again, but the only thing
+	 * that worked was a full device reset.  It feels like an awfully big
+	 * hammer, but doing a full reset after every multiblock write is
+	 * still faster than doing single-block IO (by almost two orders of
+	 * magnitude: 20KB/sec improves to about 1.8MB/sec best case).
+	 *
+	 * After doing the reset, wait for a NOTBUSY interrupt before
+	 * continuing with the next operation.
+	 */
+	if (sc->flags & CMD_MULTIWRITE) {
+		at91_mci_reset(sc);
+		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY);
+		return;
+	}
+
+	/*
+	 * This is known to be true of at91rm9200 hardware; it may or may not
+	 * apply to more recent chips:
+	 *
+	 * After stopping a multi-block read, loop to read and discard any
+	 * data that coasts in after we sent the stop command.  The docs don't
+	 * say anything about it, but empirical testing shows that 1-3
+	 * additional words of data get buffered up in some unmentioned
+	 * internal fifo and if we don't read and discard them here they end
+	 * up on the front of the next read DMA transfer we do.
+	 */
+	if (sc->flags & CMD_MULTIREAD) {
+		uint32_t sr;
+		int count = 0;
+
+		do {
+			sr = RD4(sc, MCI_SR);
+			if (sr & MCI_SR_RXRDY) {
+				RD4(sc,  MCI_RDR);
+				++count;
+			}
+		} while (sr & MCI_SR_RXRDY);
+		at91_mci_reset(sc);
+//              if (count != 0)
+//                      printf("Had to soak up %d words after read\n", count);
+	}
+
+	cmd->error = MMC_ERR_NONE;
+	at91_mci_next_operation(sc);
+
+}
+
+static void
+at91_mci_cmdrdy(struct at91_mci_softc *sc, uint32_t sr)
+{
+	struct mmc_command *cmd = sc->curcmd;
+	int i;
+
+	if (cmd == NULL)
+		return;
+
+	/*
+	 * We get here at the end of EVERY command.  We retrieve the command
+	 * response (if any) then decide what to do next based on the command.
+	 */
+
+	if (cmd->flags & MMC_RSP_PRESENT) {
+		for (i = 0; i < ((cmd->flags & MMC_RSP_136) ? 4 : 1); i++) {
+			cmd->resp[i] = RD4(sc, MCI_RSPR + i * 4);
+			if (mci_debug)
+				printf("RSPR[%d] = %x sr=%x\n", i, cmd->resp[i],  sr);
+		}
+	}
+
+	/*
+	 * If this was a stop command, go handle the various special
+	 * conditions (read: bugs) that have to be dealt with following a stop.
+	 */
+	if (cmd->opcode == MMC_STOP_TRANSMISSION) {
+		at91_mci_stop_done(sc, sr);
+		return;
+	}
+
+	/*
+	 * If this command can continue to assert BUSY beyond the response then
+	 * we need to wait for NOTBUSY before the command is really done.
+	 *
+	 * Note that this may not work properly on the at91rm9200.  It certainly
+	 * doesn't work for the STOP command that follows a multi-block write,
+	 * so post-stop CMDRDY is handled separately; see the special handling
+	 * in at91_mci_stop_done().
+	 *
+	 * Beside STOP, there are other R1B-type commands that use the busy
+	 * signal after CMDRDY: CMD7 (card select), CMD28-29 (write protect),
+	 * CMD38 (erase). I haven't tested any of them, but I rather expect
+	 * them all to have the same sort of problem with MCI_SR not actually
+	 * reflecting the state of the DAT0-line busy indicator.  So this code
+	 * may need to grow some sort of special handling for them too. (This
+	 * just in: CMD7 isn't a problem right now because dev/mmc.c incorrectly
+	 * sets the response flags to R1 rather than R1B.) XXX
+	 */
+	if ((cmd->flags & MMC_RSP_BUSY)) {
+		WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY);
+		return;
+	}
+
+	/*
+	 * If there is a data transfer with this command, then...
+	 * - If it's a read, we need to wait for ENDRX.
+	 * - If it's a write, now is the time to enable the PDC, and we need
+	 *   to wait for a BLKE that follows a TXBUFE, because if we're doing
+	 *   a split transfer we get a BLKE after the first half (when TPR/TCR
+	 *   get loaded from TNPR/TNCR).  So first we wait for the TXBUFE, and
+	 *   the handling for that interrupt will then invoke the wait for the
+	 *   subsequent BLKE which indicates actual completion.
+	 */
+	if (cmd->data) {
+		uint32_t ier;
+		if (cmd->data->flags & MMC_DATA_READ) {
+			ier = MCI_SR_ENDRX;
+		} else {
+			ier = MCI_SR_TXBUFE;
+			WR4(sc, PDC_PTCR, PDC_PTCR_TXTEN);
+		}
+		WR4(sc, MCI_IER, MCI_SR_ERROR | ier);
+		return;
+	}
+
+	/*
+	 * If we made it to here, we don't need to wait for anything more for
+	 * the current command, move on to the next command (will complete the
+	 * request if there is no next command).
+	 */
+	cmd->error = MMC_ERR_NONE;
+	at91_mci_next_operation(sc);
 }
 
 static void
 at91_mci_intr(void *arg)
 {
 	struct at91_mci_softc *sc = (struct at91_mci_softc*)arg;
-	uint32_t sr;
-	int i, done = 0;
-	struct mmc_command *cmd;
+	struct mmc_command *cmd = sc->curcmd;
+	uint32_t sr, isr;
 
 	AT91_MCI_LOCK(sc);
-	sr = RD4(sc, MCI_SR) & RD4(sc, MCI_IMR);
-//	printf("i 0x%x\n", sr);
-	cmd = sc->curcmd;
-	if (sr & MCI_SR_ERROR) {
-		// Ignore CRC errors on CMD2 and ACMD47, per relevant standards
-		if ((sr & MCI_SR_RCRCE) && (cmd->opcode == MMC_SEND_OP_COND ||
-		    cmd->opcode == ACMD_SD_SEND_OP_COND))
-			cmd->error = MMC_ERR_NONE;
-		else if (sr & (MCI_SR_RTOE | MCI_SR_DTOE))
+
+	sr = RD4(sc, MCI_SR);
+	isr = sr & RD4(sc, MCI_IMR);
+
+	if (mci_debug)
+		printf("i 0x%x sr 0x%x\n", isr, sr);
+
+	/*
+	 * All interrupts are one-shot; disable it now.
+	 * The next operation will re-enable whatever interrupts it wants.
+	 */
+	WR4(sc, MCI_IDR, isr);
+	if (isr & MCI_SR_ERROR) {
+		if (isr & (MCI_SR_RTOE | MCI_SR_DTOE))
 			cmd->error = MMC_ERR_TIMEOUT;
-		else if (sr & (MCI_SR_RCRCE | MCI_SR_DCRCE))
+		else if (isr & (MCI_SR_RCRCE | MCI_SR_DCRCE))
 			cmd->error = MMC_ERR_BADCRC;
-		else if (sr & (MCI_SR_OVRE | MCI_SR_UNRE))
+		else if (isr & (MCI_SR_OVRE | MCI_SR_UNRE))
 			cmd->error = MMC_ERR_FIFO;
 		else
 			cmd->error = MMC_ERR_FAILED;
-		done = 1;
-		if (sc->mapped && cmd->error) {
-			bus_dmamap_unload(sc->dmatag, sc->map);
-			sc->mapped--;
+		/*
+		 * CMD8 is used to probe for SDHC cards, a standard SD card
+		 * will get a response timeout; don't report it because it's a
+		 * normal and expected condition.  One might argue that all
+		 * error reporting should be left to higher levels, but when
+		 * they report at all it's always EIO, which isn't very
+		 * helpful. XXX bootverbose?
+		 */
+		if (cmd->opcode != 8) {
+			device_printf(sc->dev, 

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201208280128.q7S1SriX088038>