From owner-freebsd-arm@FreeBSD.ORG Tue Apr 19 18:10:11 2011 Return-Path: Delivered-To: freebsd-arm@hub.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 2D6DC1065678 for ; Tue, 19 Apr 2011 18:10:11 +0000 (UTC) (envelope-from gnats@FreeBSD.org) Received: from freefall.freebsd.org (freefall.freebsd.org [IPv6:2001:4f8:fff6::28]) by mx1.freebsd.org (Postfix) with ESMTP id 189178FC0A for ; Tue, 19 Apr 2011 18:10:11 +0000 (UTC) Received: from freefall.freebsd.org (localhost [127.0.0.1]) by freefall.freebsd.org (8.14.4/8.14.4) with ESMTP id p3JIABF3019632 for ; Tue, 19 Apr 2011 18:10:11 GMT (envelope-from gnats@freefall.freebsd.org) Received: (from gnats@localhost) by freefall.freebsd.org (8.14.4/8.14.4/Submit) id p3JIAB8e019631; Tue, 19 Apr 2011 18:10:11 GMT (envelope-from gnats) Date: Tue, 19 Apr 2011 18:10:11 GMT Message-Id: <201104191810.p3JIAB8e019631@freefall.freebsd.org> To: freebsd-arm@FreeBSD.org From: Ian Lepore Cc: Subject: Re: arm/155214: [patch] MMC/SD IO slow on Atmel ARM with modern large SD cards (updated patch) X-BeenThere: freebsd-arm@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list Reply-To: Ian Lepore List-Id: Porting FreeBSD to the StrongARM Processor List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 19 Apr 2011 18:10:11 -0000 The following reply was made to PR arm/155214; it has been noted by GNATS. From: Ian Lepore To: bug-followup@FreeBSD.org, freebsd@damnhippie.dyndns.org Cc: Subject: Re: arm/155214: [patch] MMC/SD IO slow on Atmel ARM with modern large SD cards (updated patch) Date: Tue, 19 Apr 2011 12:06:43 -0600 Index: sys/arm/at91/at91_mci.c =================================================================== --- sys/arm/at91/at91_mci.c.cvs_v1.18 2011-03-30 08:30:18.000000000 -0600 +++ sys/arm/at91/at91_mci.c 2011-04-18 12:04:55.000000000 -0600 @@ -67,7 +67,60 @@ #include "opt_at91.h" -#define BBSZ 512 +/* About running the MCI bus at 30mhz... + * + * Historically, the MCI bus has been run at 30mhz on systems with a 60mhz + * master clock, due to a bug in the mantissa table in dev/mmc.c making it + * appear that the card's max speed was always 30mhz. Fixing that bug causes + * the mmc driver to request a 25mhz clock (as it should) and the logic in + * at91_mci_update_ios() picks the highest speed that doesn't exceed that limit. + * With a 60mhz MCK that would be 15mhz, and that's a real performance buzzkill + * when you've been getting away with 30mhz all along. + * + * By defining AT91_MCI_USE_30MHZ (or setting the 30mhz=1 device hint or sysctl) + * you can enable logic in at91_mci_update_ios() to set the mci bus to 30mhz + * when MCK is 60mhz and the requested speed is 25mhz. This appears to work on + * virtually all SD cards, since it is what this driver has been doing by + * accident since day one. I've seen modern SD cards run at 45mhz/1-bit in + * standard mode (high speed mode enable commands not sent) without problems. + * + * Speaking of high-speed mode, the rm9200 manual says the MCI device supports + * the SD v1.0 specification and can run up to 50mhz. This is interesting in + * that the SD v1.0 spec caps the speed at 25mhz; high speed mode was added in + * the v1.10 spec. Furthermore, high speed mode doesn't just crank up the + * clock, it alters the signal timing. The rm9200 MCI device doesn't support + * these altered timings. So while speeds over 25mhz may work, they only work + * in what the SD spec calls "default" speed mode, and it amounts to violating + * the spec by overclocking the bus. + * + * If you also enable 4-wire mode it's possible the 30mhz transfers will fail. + * If you have the USB host device and OHCI driver enabled it's g'teed to fail + * (I get intermittant overrun and underrun errors even at 15mhz 4-wire with + * OHCI). Note that you don't even need to have usb devices attached to the + * system, the errors begin to occur as soon as the OHCI driver sets the + * register bit to enable periodic transfers. It appears (based on brief + * investigation) that the usb host controller uses so much ASB bandwidth that + * sometimes the DMA for MCI transfers doesn't get a bus grant in time and data + * gets dropped. Adding even a modicum of network activity changes the symptom + * from intermittant to very frequent. + */ + +#ifndef AT91_MCI_USE_30MHZ +#define AT91_MCI_USE_30MHZ 1 +#endif + +/* Allocate 2 bounce buffers, each being sized to half the system default + * physical IO size. That enables doing DFLTPHYS sized transfers at a time, + * with read transfers in particular being split into two operations so that we + * can overlap some of the byte-swapping needed due to the rm9200 erratum with + * the DMA for the second half of the transfer. + */ + +#define BBCOUNT 2 +#define BBSIZE (DFLTPHYS/BBCOUNT) +#define MAX_BLOCKS ((BBSIZE*BBCOUNT)/512) + +static int mci_debug; struct at91_mci_softc { void *intrhand; /* Interrupt handle */ @@ -75,21 +128,26 @@ int sc_cap; #define CAP_HAS_4WIRE 1 /* Has 4 wire bus */ #define CAP_NEEDS_BYTESWAP 2 /* broken hardware needing bounce */ - int flags; int has_4wire; -#define CMD_STARTED 1 -#define STOP_STARTED 2 + int use_30mhz; + int flags; +#define PENDING_CMD 0x01 +#define PENDING_STOP 0x02 +#define CMD_MULTIREAD 0x10 +#define CMD_MULTIWRITE 0x20 struct resource *irq_res; /* IRQ resource */ struct resource *mem_res; /* Memory resource */ struct mtx sc_mtx; bus_dma_tag_t dmatag; - bus_dmamap_t map; - int mapped; struct mmc_host host; int bus_busy; struct mmc_request *req; struct mmc_command *curcmd; - char bounce_buffer[BBSZ]; + bus_dmamap_t bbuf_map[BBCOUNT]; + char * bbuf_vaddr[BBCOUNT]; /* bounce bufs in KVA space */ + uint32_t bbuf_len[BBCOUNT]; /* len currently queued for bounce buf */ + uint32_t bbuf_curidx; /* which bbuf is the active DMA buffer */ + uint32_t xfer_offset; /* offset so far into caller's buf */ }; static inline uint32_t @@ -123,6 +181,47 @@ #define AT91_MCI_ASSERT_LOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_OWNED); #define AT91_MCI_ASSERT_UNLOCKED(_sc) mtx_assert(&_sc->sc_mtx, MA_NOTOWNED); +static void +at91_bswap_buf(struct at91_mci_softc *sc, void * dptr, void * sptr, uint32_t memsize) +{ + uint32_t * dst = (uint32_t *)dptr; + uint32_t * src = (uint32_t *)sptr; + uint32_t i; + + /* If the hardware doesn't need byte-swapping, let bcopy() do the work. + */ + + if (!(sc->sc_cap & CAP_NEEDS_BYTESWAP)) { + bcopy(dptr, sptr, memsize); + return; + } + + /* Nice performance boost for slightly unrolling this loop. + * (But very little extra boost for further unrolling it.) + */ + + for (i = 0; i < memsize; i += 16) { + *dst++ = bswap32(*src++); + *dst++ = bswap32(*src++); + *dst++ = bswap32(*src++); + *dst++ = bswap32(*src++); + } + + /* Mop up the last 1-3 words, if any. */ + + for (i = 0; i < (memsize & 0x0F); i += 4) { + *dst++ = bswap32(*src++); + } +} + +static void +at91_mci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error) +{ + if (error != 0) + return; + *(bus_addr_t *)arg = segs[0].ds_addr; +} + static void at91_mci_pdc_disable(struct at91_mci_softc *sc) { @@ -137,15 +236,57 @@ WR4(sc, PDC_TNCR, 0); } +/* Reset the controller, then restore most of the current state. + * + * This is called after detecting an error. It's also called after stopping a + * multi-block write, to un-wedge the device so that it will handle the NOTBUSY + * signal correctly. See comments in at91_mci_stop_done() for more details. + */ +static void at91_mci_reset(struct at91_mci_softc *sc) +{ + uint32_t mr; + uint32_t sdcr; + uint32_t dtor; + uint32_t imr; + + at91_mci_pdc_disable(sc); + + /* save current state */ + + imr = RD4(sc, MCI_IMR); + mr = RD4(sc, MCI_MR) & 0x7fff; + sdcr = RD4(sc, MCI_SDCR); + dtor = RD4(sc, MCI_DTOR); + + /* reset the controller */ + + WR4(sc, MCI_IDR, 0xffffffff); + WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST); + + /* restore state */ + + WR4(sc, MCI_CR, MCI_CR_MCIEN|MCI_CR_PWSEN); + WR4(sc, MCI_MR, mr); + WR4(sc, MCI_SDCR, sdcr); + WR4(sc, MCI_DTOR, dtor); + WR4(sc, MCI_IER, imr); + + /* Make sure sdio interrupts will fire. Not sure why reading + * SR ensures that, but this is in the linux driver. + */ + + RD4(sc, MCI_SR); +} + static void at91_mci_init(device_t dev) { struct at91_mci_softc *sc = device_get_softc(dev); - WR4(sc, MCI_CR, MCI_CR_MCIEN); /* Enable controller */ + WR4(sc, MCI_CR, MCI_CR_MCIDIS | MCI_CR_SWRST); /* Put the device into reset */ WR4(sc, MCI_IDR, 0xffffffff); /* Turn off interrupts */ WR4(sc, MCI_DTOR, MCI_DTOR_DTOMUL_1M | 1); - WR4(sc, MCI_MR, 0x834a); // XXX GROSS HACK FROM LINUX + WR4(sc, MCI_MR, 0x834a); // set PDCMODE, PWSDIV=3, CLKDIV=75 #ifndef AT91_MCI_SLOT_B WR4(sc, MCI_SDCR, 0); /* SLOT A, 1 bit bus */ #else @@ -153,6 +294,11 @@ * a two slot card that we know of. XXX */ WR4(sc, MCI_SDCR, 1); /* SLOT B, 1 bit bus */ #endif + /* Enable controller, including power-save. The slower clock of + * the power-save mode is only in effect when there is no transfer in + * progress, so it can be left in this mode all the time. + */ + WR4(sc, MCI_CR, MCI_CR_MCIEN|MCI_CR_PWSEN); } static void @@ -167,8 +313,8 @@ static int at91_mci_probe(device_t dev) -{ +{ device_set_desc(dev, "MCI mmc/sd host bridge"); return (0); } @@ -180,6 +326,7 @@ struct sysctl_ctx_list *sctx; struct sysctl_oid *soid; device_t child; + int i; int err; sc->dev = dev; @@ -193,48 +340,100 @@ AT91_MCI_LOCK_INIT(sc); - /* - * Allocate DMA tags and maps + at91_mci_fini(dev); + at91_mci_init(dev); + + /* Allocate DMA tags and maps and bounce buffers. + * + * The parms in the tag_create call cause the dmamem_alloc call to + * create each bounce buffer as a single contiguous buffer of BBSIZE + * bytes aligned to a 4096 byte boundary. + * + * Do not use DMA_COHERENT for these buffers because that maps the + * memory as non-cachable, which prevents cache line burst fills/writes, + * which is something we need since we're trying to overlap the + * byte-swapping with the DMA operations. */ - err = bus_dma_tag_create(bus_get_dma_tag(dev), 1, 0, - BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, MAXPHYS, 1, - MAXPHYS, BUS_DMA_ALLOCNOW, NULL, NULL, &sc->dmatag); - if (err != 0) - goto out; - err = bus_dmamap_create(sc->dmatag, 0, &sc->map); + err = bus_dma_tag_create(bus_get_dma_tag(dev), 4096, 0, + BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, + BBSIZE, 1, MAXPHYS, 0, NULL, NULL, &sc->dmatag); if (err != 0) goto out; - at91_mci_fini(dev); - at91_mci_init(dev); + for (i = 0; i < BBCOUNT; ++i) { + err = bus_dmamem_alloc(sc->dmatag, (void **)&sc->bbuf_vaddr[i], + BUS_DMA_NOWAIT, &sc->bbuf_map[i]); + if (err != 0) + goto out; + } /* * Activate the interrupt */ - err = bus_setup_intr(dev, sc->irq_res, INTR_TYPE_MISC | INTR_MPSAFE, + err = bus_setup_intr(dev, sc->irq_res, INTR_TYPE_BIO | INTR_MPSAFE, NULL, at91_mci_intr, sc, &sc->intrhand); if (err) { AT91_MCI_LOCK_DESTROY(sc); goto out; } + /* Allow 4-wire to be initially set via #define. + * Allow a device hint to override that. + * Allow a sysctl to override that. + */ + +#if defined(AT91_MCI_HAS_4WIRE) && AT91_MCI_HAS_4WIRE != 0 + sc->has_4wire = 1; +#else + sc->has_4wire = 0; +#endif + + resource_int_value(device_get_name(dev), device_get_unit(dev), + "4wire", &sc->has_4wire); + sctx = device_get_sysctl_ctx(dev); soid = device_get_sysctl_tree(dev); SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "4wire", CTLFLAG_RW, &sc->has_4wire, 0, "has 4 wire SD Card bus"); -#ifdef AT91_MCI_HAS_4WIRE - sc->has_4wire = 1; -#endif if (sc->has_4wire) sc->sc_cap |= CAP_HAS_4WIRE; - sc->host.f_min = at91_master_clock / 512; + /* Allow use_30mhz to be initially set via #define. + * Allow a device hint to override that. + * Allow a sysctl to override that. + */ + +#if defined(AT91_MCI_USE_30MHZ) && AT91_MCI_USE_30MHZ != 0 + sc->use_30mhz = 1; +#else + sc->use_30mhz = 0; +#endif + + resource_int_value(device_get_name(dev), device_get_unit(dev), + "30mhz", &sc->use_30mhz); + + sctx = device_get_sysctl_ctx(dev); + soid = device_get_sysctl_tree(dev); + SYSCTL_ADD_UINT(sctx, SYSCTL_CHILDREN(soid), OID_AUTO, "30mhz", + CTLFLAG_RW, &sc->use_30mhz, 0, "use 30mhz clock for 25mhz request"); + + /* Our real min freq is master_clock/512, but upper driver layers are + * going to set the min speed during card discovery, and the right speed + * for that is 400khz, so advertise a safe value just under that. + * + * For max speed, while the rm9200 manual says the max is 50mhz, it also + * says it supports only the SD v1.0 spec, which means the real limit is + * 25mhz. On the other hand, historical use has been to slightly violate + * the standard by running the bus at 30mhz. For more information on + * that, see the comments at the top of this file. + */ + sc->host.f_min = 375000; sc->host.f_max = at91_master_clock / 2; - if (sc->host.f_max > 50000000) - sc->host.f_max = 50000000; /* Limit to 50MHz */ + if (sc->host.f_max > 25000000) + sc->host.f_max = 25000000; sc->host.host_ocr = MMC_OCR_320_330 | MMC_OCR_330_340; sc->host.caps = 0; @@ -252,8 +451,15 @@ static int at91_mci_detach(device_t dev) { + struct at91_mci_softc *sc = device_get_softc(dev); + at91_mci_fini(dev); at91_mci_deactivate(dev); + + bus_dmamem_free(sc->dmatag, sc->bbuf_vaddr[0], sc->bbuf_map[0]); + bus_dmamem_free(sc->dmatag, sc->bbuf_vaddr[1], sc->bbuf_map[1]); + bus_dma_tag_destroy(sc->dmatag); + return (EBUSY); /* XXX */ } @@ -293,7 +499,7 @@ sc->intrhand = 0; bus_generic_detach(sc->dev); if (sc->mem_res) - bus_release_resource(dev, SYS_RES_IOPORT, + bus_release_resource(dev, SYS_RES_MEMORY, rman_get_rid(sc->mem_res), sc->mem_res); sc->mem_res = 0; if (sc->irq_res) @@ -303,14 +509,6 @@ return; } -static void -at91_mci_getaddr(void *arg, bus_dma_segment_t *segs, int nsegs, int error) -{ - if (error != 0) - return; - *(bus_addr_t *)arg = segs[0].ds_addr; -} - static int at91_mci_update_ios(device_t brdev, device_t reqdev) { @@ -322,16 +520,31 @@ sc = device_get_softc(brdev); host = &sc->host; ios = &host->ios; - // bus mode? + + /* Calculate our closest available clock speed that doesn't exceed the + * requested speed. + * + * If the master clock is running at 60mhz and the requested bus speed + * is 25mhz and the use_30mhz flag is on, set clkdiv to zero to get a + * 30mhz mci clock. See comments near the top of the file for more info. + * + * Whatever we come up with, store it back into ios->clock so that the + * upper layer drivers can report the actual speed of the bus. + */ + if (ios->clock == 0) { WR4(sc, MCI_CR, MCI_CR_MCIDIS); clkdiv = 0; } else { - WR4(sc, MCI_CR, MCI_CR_MCIEN); - if ((at91_master_clock % (ios->clock * 2)) == 0) + WR4(sc, MCI_CR, MCI_CR_MCIEN|MCI_CR_PWSEN); + if (sc->use_30mhz && at91_master_clock == 60000000 && + ios->clock == 25000000) + clkdiv = 0; + else if ((at91_master_clock % (ios->clock * 2)) == 0) clkdiv = ((at91_master_clock / ios->clock) / 2) - 1; else clkdiv = (at91_master_clock / ios->clock) / 2; + ios->clock = at91_master_clock / ((clkdiv+1) * 2); } if (ios->bus_width == bus_width_4) WR4(sc, MCI_SDCR, RD4(sc, MCI_SDCR) | MCI_SDCR_SDCBUS); @@ -346,137 +559,247 @@ static void at91_mci_start_cmd(struct at91_mci_softc *sc, struct mmc_command *cmd) { - uint32_t cmdr, ier = 0, mr; - uint32_t *src, *dst; - int i; + uint32_t cmdr, mr; struct mmc_data *data; - void *vaddr; - bus_addr_t paddr; sc->curcmd = cmd; data = cmd->data; - cmdr = cmd->opcode; /* XXX Upper layers don't always set this */ cmd->mrq = sc->req; + /* Begin setting up command register. */ + + cmdr = cmd->opcode; + + if (sc->host.ios.bus_mode == opendrain) + cmdr |= MCI_CMDR_OPDCMD; + + /* Set up response handling. Allow max timeout for responses. */ + if (MMC_RSP(cmd->flags) == MMC_RSP_NONE) cmdr |= MCI_CMDR_RSPTYP_NO; else { - /* Allow big timeout for responses */ cmdr |= MCI_CMDR_MAXLAT; if (cmd->flags & MMC_RSP_136) cmdr |= MCI_CMDR_RSPTYP_136; else cmdr |= MCI_CMDR_RSPTYP_48; } - if (cmd->opcode == MMC_STOP_TRANSMISSION) - cmdr |= MCI_CMDR_TRCMD_STOP; - if (sc->host.ios.bus_mode == opendrain) - cmdr |= MCI_CMDR_OPDCMD; - if (!data) { - // The no data case is fairly simple + + /* If there is no data transfer, just set up the right interrupt mask + * and start the command. + * + * The interrupt mask needs to be CMDRDY plus all non-data-transfer + * errors. It's important to leave the transfer-related errors out, to + * avoid spurious timeout or crc errors on a STOP command following a + * multiblock read. When a multiblock read is in progress, sending a + * STOP in the middle of a block occasionally triggers such errors, but + * we're totally disinterested in them because we've already gotten all + * the data we wanted without error before sending the STOP command. + */ + + if (data == NULL) { + uint32_t ier = MCI_SR_CMDRDY | + MCI_SR_RTOE | MCI_SR_RENDE | + MCI_SR_RCRCE | MCI_SR_RDIRE | MCI_SR_RINDE; + at91_mci_pdc_disable(sc); -// printf("CMDR %x ARGR %x\n", cmdr, cmd->arg); + + if (cmd->opcode == MMC_STOP_TRANSMISSION) + cmdr |= MCI_CMDR_TRCMD_STOP; + + /* Ignore response CRC on CMD2 and ACMD41, per standard. */ + + if (cmd->opcode == MMC_SEND_OP_COND || + cmd->opcode == ACMD_SD_SEND_OP_COND) + ier &= ~MCI_SR_RCRCE; + + if (mci_debug) + printf("CMDR %x (opcode %d) ARGR %x no data\n", + cmdr, cmd->opcode, cmd->arg); + WR4(sc, MCI_ARGR, cmd->arg); WR4(sc, MCI_CMDR, cmdr); - WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_CMDRDY); + WR4(sc, MCI_IDR, 0xffffffff); + WR4(sc, MCI_IER, ier); return; } + + /* There is data, set up the transfer-related parts of the command. */ + if (data->flags & MMC_DATA_READ) cmdr |= MCI_CMDR_TRDIR; + if (data->flags & (MMC_DATA_READ | MMC_DATA_WRITE)) cmdr |= MCI_CMDR_TRCMD_START; + if (data->flags & MMC_DATA_STREAM) cmdr |= MCI_CMDR_TRTYP_STREAM; - if (data->flags & MMC_DATA_MULTI) + else if (data->flags & MMC_DATA_MULTI) { cmdr |= MCI_CMDR_TRTYP_MULTIPLE; - // Set block size and turn on PDC mode for dma xfer and disable - // PDC until we're ready. - mr = RD4(sc, MCI_MR) & ~MCI_MR_BLKLEN; - WR4(sc, MCI_MR, mr | (data->len << 16) | MCI_MR_PDCMODE); - WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS); - if (cmdr & MCI_CMDR_TRCMD_START) { - if (cmdr & MCI_CMDR_TRDIR) - vaddr = cmd->data->data; - else { - /* Use bounce buffer even if we don't need - * byteswap, since buffer may straddle a page - * boundry, and we don't handle multi-segment - * transfers in hardware. - * (page issues seen from 'bsdlabel -w' which - * uses raw geom access to the volume). - * Greg Ansley (gja (at) ansley.com) - */ - vaddr = sc->bounce_buffer; - src = (uint32_t *)cmd->data->data; - dst = (uint32_t *)vaddr; - if (sc->sc_cap & CAP_NEEDS_BYTESWAP) { - for (i = 0; i < data->len / 4; i++) - dst[i] = bswap32(src[i]); - } else - memcpy(dst, src, data->len); - } - data->xfer_len = 0; - if (bus_dmamap_load(sc->dmatag, sc->map, vaddr, data->len, - at91_mci_getaddr, &paddr, 0) != 0) { - cmd->error = MMC_ERR_NO_MEMORY; - sc->req = NULL; - sc->curcmd = NULL; - cmd->mrq->done(cmd->mrq); - return; - } - sc->mapped++; - if (cmdr & MCI_CMDR_TRDIR) { - bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_PREREAD); - WR4(sc, PDC_RPR, paddr); - WR4(sc, PDC_RCR, data->len / 4); - ier = MCI_SR_ENDRX; - } else { - bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_PREWRITE); - WR4(sc, PDC_TPR, paddr); - WR4(sc, PDC_TCR, data->len / 4); - ier = MCI_SR_TXBUFE; - } + sc->flags |= (data->flags & MMC_DATA_READ) ? + CMD_MULTIREAD : CMD_MULTIWRITE; } -// printf("CMDR %x ARGR %x with data\n", cmdr, cmd->arg); - WR4(sc, MCI_ARGR, cmd->arg); - if (cmdr & MCI_CMDR_TRCMD_START) { - if (cmdr & MCI_CMDR_TRDIR) { + + /* Disable PDC until we're ready. + * + * Set block size and turn on PDC mode for dma xfer. + * Note that the block size is the smaller of the amount of data to be + * transferred, or 512 bytes. The 512 size is fixed by the standard; + * smaller blocks are possible, but never larger. + */ + + WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS); + + mr = RD4(sc,MCI_MR) & ~MCI_MR_BLKLEN; + mr |= min(data->len, 512) << 16; + WR4(sc, MCI_MR, mr | MCI_MR_PDCMODE|MCI_MR_PDCPADV); + + /* Set up DMA. + * + * Use bounce buffers even if we don't need to byteswap, because doing + * multi-block IO with large DMA buffers is way fast (compared to + * single-block IO), even after incurring the overhead of also copying + * from/to the caller's buffers (which may be in non-contiguous physical + * pages). + * + * In an ideal non-byteswap world we could create a dma tag that allows + * for discontiguous segments and do the IO directly from/to the + * caller's buffer(s), using ENDRX/ENDTX interrupts to chain the + * discontiguous buffers through the PDC. Someday. + * + * If a read is bigger than 2k, split it in half so that we can start + * byte-swapping the first half while the second half is on the wire. + * It would be best if we could split it into 8k chunks, but we can't + * always keep up with the byte-swapping due to other system activity, + * and if an RXBUFF interrupt happens while we're still handling the + * byte-swap from the prior buffer (IE, we haven't returned from + * handling the prior interrupt yet), then data will get dropped on the + * floor and we can't easily recover from that. The right fix for that + * would be to have the interrupt handling only keep the DMA flowing and + * enqueue filled buffers to be byte-swapped in a non-interrupt context. + * Even that won't work on the write side of things though; in that + * context we have to have all the data ready to go before starting the + * dma. + * + * XXX what about stream transfers? + */ + + sc->xfer_offset = 0; + sc->bbuf_curidx = 0; + + if (data->flags & (MMC_DATA_READ | MMC_DATA_WRITE)) { + uint32_t len; + uint32_t remaining = data->len; + bus_addr_t paddr; + int err; + + if (remaining > (BBCOUNT*BBSIZE)) + panic("IO read size exceeds MAXDATA\n"); + + if (data->flags & MMC_DATA_READ) { + if (remaining > 2048) + len = remaining / 2; + else + len = remaining; + err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[0], + sc->bbuf_vaddr[0], len, at91_mci_getaddr, + &paddr, BUS_DMA_NOWAIT); + if (err != 0) + panic("IO read dmamap_load failed\n"); + bus_dmamap_sync(sc->dmatag, sc->bbuf_map[0], + BUS_DMASYNC_PREREAD); + WR4(sc, PDC_RPR, paddr); + WR4(sc, PDC_RCR, len / 4); + sc->bbuf_len[0] = len; + remaining -= len; + if (remaining == 0) { + sc->bbuf_len[1] = 0; + } else { + len = remaining; + err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[1], + sc->bbuf_vaddr[1], len, at91_mci_getaddr, + &paddr, BUS_DMA_NOWAIT); + if (err != 0) + panic("IO read dmamap_load failed\n"); + bus_dmamap_sync(sc->dmatag, sc->bbuf_map[1], + BUS_DMASYNC_PREREAD); + WR4(sc, PDC_RNPR, paddr); + WR4(sc, PDC_RNCR, len / 4); + sc->bbuf_len[1] = len; + remaining -= len; + } WR4(sc, PDC_PTCR, PDC_PTCR_RXTEN); - WR4(sc, MCI_CMDR, cmdr); } else { - WR4(sc, MCI_CMDR, cmdr); - WR4(sc, PDC_PTCR, PDC_PTCR_TXTEN); + len = min(BBSIZE, remaining); + at91_bswap_buf(sc, sc->bbuf_vaddr[0], data->data, len); + err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[0], + sc->bbuf_vaddr[0], len, at91_mci_getaddr, + &paddr, BUS_DMA_NOWAIT); + if (err != 0) + panic("IO write dmamap_load failed\n"); + bus_dmamap_sync(sc->dmatag, sc->bbuf_map[0], + BUS_DMASYNC_PREWRITE); + WR4(sc, PDC_TPR,paddr); + WR4(sc, PDC_TCR, len / 4); + sc->bbuf_len[0] = len; + remaining -= len; + if (remaining == 0) { + sc->bbuf_len[1] = 0; + } else { + len = remaining; + at91_bswap_buf(sc, sc->bbuf_vaddr[1], + ((char *)data->data)+BBSIZE, len); + err = bus_dmamap_load(sc->dmatag, sc->bbuf_map[1], + sc->bbuf_vaddr[1], len, at91_mci_getaddr, + &paddr, BUS_DMA_NOWAIT); + if (err != 0) + panic("IO write dmamap_load failed\n"); + bus_dmamap_sync(sc->dmatag, sc->bbuf_map[1], + BUS_DMASYNC_PREWRITE); + WR4(sc, PDC_TNPR, paddr); + WR4(sc, PDC_TNCR, len / 4); + sc->bbuf_len[1] = len; + remaining -= len; + } + /* do not enable PDC xfer until CMDRDY asserted */ } + data->xfer_len = 0; /* XXX what's this? appears to be unused. */ } - WR4(sc, MCI_IER, MCI_SR_ERROR | ier); + + if (mci_debug) + printf("CMDR %x (opcode %d) ARGR %x with data len %d\n", + cmdr, cmd->opcode, cmd->arg, cmd->data->len); + + WR4(sc, MCI_ARGR, cmd->arg); + WR4(sc, MCI_CMDR, cmdr); + WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_CMDRDY); } static void -at91_mci_start(struct at91_mci_softc *sc) +at91_mci_next_operation(struct at91_mci_softc *sc) { struct mmc_request *req; req = sc->req; if (req == NULL) return; - // assert locked - if (!(sc->flags & CMD_STARTED)) { - sc->flags |= CMD_STARTED; -// printf("Starting CMD\n"); + + if (sc->flags & PENDING_CMD) { + sc->flags &= ~PENDING_CMD; at91_mci_start_cmd(sc, req->cmd); return; - } - if (!(sc->flags & STOP_STARTED) && req->stop) { -// printf("Starting Stop\n"); - sc->flags |= STOP_STARTED; + } else if (sc->flags & PENDING_STOP) { + sc->flags &= ~PENDING_STOP; at91_mci_start_cmd(sc, req->stop); return; } - /* We must be done -- bad idea to do this while locked? */ + + WR4(sc, MCI_IDR, 0xffffffff); sc->req = NULL; sc->curcmd = NULL; + //printf("req done\n"); req->done(req); } @@ -486,16 +809,16 @@ struct at91_mci_softc *sc = device_get_softc(brdev); AT91_MCI_LOCK(sc); - // XXX do we want to be able to queue up multiple commands? - // XXX sounds like a good idea, but all protocols are sync, so - // XXX maybe the idea is naive... if (sc->req != NULL) { AT91_MCI_UNLOCK(sc); return (EBUSY); } + //printf("new req\n"); sc->req = req; - sc->flags = 0; - at91_mci_start(sc); + sc->flags = PENDING_CMD; + if (sc->req->stop) + sc->flags |= PENDING_STOP; + at91_mci_next_operation(sc); AT91_MCI_UNLOCK(sc); return (0); } @@ -533,120 +856,341 @@ } static void -at91_mci_read_done(struct at91_mci_softc *sc) +at91_mci_read_done(struct at91_mci_softc *sc, uint32_t sr) { - uint32_t *walker; - struct mmc_command *cmd; - int i, len; - - cmd = sc->curcmd; - bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_POSTREAD); - bus_dmamap_unload(sc->dmatag, sc->map); - sc->mapped--; - if (sc->sc_cap & CAP_NEEDS_BYTESWAP) { - walker = (uint32_t *)cmd->data->data; - len = cmd->data->len / 4; - for (i = 0; i < len; i++) - walker[i] = bswap32(walker[i]); - } - // Finish up the sequence... - WR4(sc, MCI_IDR, MCI_SR_ENDRX); - WR4(sc, MCI_IER, MCI_SR_RXBUFF); - WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS); + struct mmc_command *cmd = sc->curcmd; + char * dataptr = (char *)cmd->data->data; + uint32_t curidx = sc->bbuf_curidx; + uint32_t len = sc->bbuf_len[curidx]; + + /* We arrive here when a DMA transfer for a read is done, whether it's a + * single or multi-block read. + * + * We byte-swap the buffer that just completed, and if that is the last + * buffer that's part of this read then we move on to the next + * operation, otherwise we wait for another ENDRX for the next bufer. + */ + + bus_dmamap_sync(sc->dmatag, sc->bbuf_map[curidx], BUS_DMASYNC_POSTREAD); + bus_dmamap_unload(sc->dmatag, sc->bbuf_map[curidx]); + + at91_bswap_buf(sc, dataptr + sc->xfer_offset, sc->bbuf_vaddr[curidx], len); + + if (mci_debug) { + printf("read done sr %x curidx %d len %d xfer_offset %d\n", + sr, curidx, len, sc->xfer_offset); + } + + sc->xfer_offset += len; + sc->bbuf_curidx = !curidx; /* swap buffers */ + + /* If we've transferred all the data, move on to the next operation. + * + * If we're still transferring the last buffer, RNCR is already zero but + * we have to write a zero anyway to clear the ENDRX status so we don't + * re-interrupt until the last buffer is done. + */ + + if (sc->xfer_offset == cmd->data->len) { + WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS); + cmd->error = MMC_ERR_NONE; + at91_mci_next_operation(sc); + } else { + WR4(sc, PDC_RNCR, 0); + WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_ENDRX); + } } static void -at91_mci_xmit_done(struct at91_mci_softc *sc) +at91_mci_write_done(struct at91_mci_softc *sc, uint32_t sr) { - // Finish up the sequence... + struct mmc_command *cmd = sc->curcmd; + + /* We arrive here when the entire DMA transfer for a write is done, + * whether it's a single or multi-block write. If it's multi-block we + * have to immediately move on to the next operation which is to send + * the stop command. If it's a single-block transfer we need to wait + * for NOTBUSY, but if that's already asserted we can avoid another + * interrupt and just move on to completing the request right away. + */ + WR4(sc, PDC_PTCR, PDC_PTCR_RXTDIS | PDC_PTCR_TXTDIS); - WR4(sc, MCI_IDR, MCI_SR_TXBUFE); - WR4(sc, MCI_IER, MCI_SR_NOTBUSY); - bus_dmamap_sync(sc->dmatag, sc->map, BUS_DMASYNC_POSTWRITE); - bus_dmamap_unload(sc->dmatag, sc->map); - sc->mapped--; + + bus_dmamap_sync(sc->dmatag, sc->bbuf_map[sc->bbuf_curidx], BUS_DMASYNC_POSTWRITE); + bus_dmamap_unload(sc->dmatag, sc->bbuf_map[sc->bbuf_curidx]); + + if ((cmd->data->flags & MMC_DATA_MULTI) || (sr & MCI_SR_NOTBUSY)) { + cmd->error = MMC_ERR_NONE; + at91_mci_next_operation(sc); + } else { + WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY); + } +} + +static void +at91_mci_notbusy(struct at91_mci_softc *sc) +{ + struct mmc_command *cmd = sc->curcmd; + + /* We arrive here by either completion of a single-block write, or + * completion of the stop command that ended a multi-block write (and, I + * suppose, after a card-select or erase, but I haven't tested those). + * Anyway, we're done and it's time to move on to the next command. + */ + + cmd->error = MMC_ERR_NONE; + at91_mci_next_operation(sc); +} + +static void +at91_mci_stop_done(struct at91_mci_softc *sc, uint32_t sr) +{ + struct mmc_command *cmd = sc->curcmd; + + /* We arrive here after receiving CMDRDY for a MMC_STOP_TRANSMISSION + * command. Depending on the operation being stopped, we may have to do + * some unusual things to work around hardware bugs. + */ + + /* This is known to be true of at91rm9200 hardware; it may or may not + * apply to more recent chips: + * + * After stopping a multi-block write, the NOTBUSY bit in MCI_SR does + * not properly reflect the actual busy state of the card as signaled on + * the DAT0 line; it always claims the card is not-busy. If we believe + * that and let operations continue, following commands will fail with + * response timeouts (except of course MMC_SEND_STATUS -- it indicates + * the card is busy in the PRG state, which was the smoking gun that + * showed MCI_SR NOTBUSY was not tracking DAT0 correctly). + * + * The atmel docs are emphatic: "This flag [NOTBUSY] must be used only + * for Write Operations." I guess technically since we sent a stop it's + * not a write operation anymore. But then just what did they think it + * meant for the stop command to have "...an optional busy signal + * transmitted on the data line" according to the SD spec? + * + * I tried a variety of things to un-wedge the MCI and get the status + * register to reflect NOTBUSY correctly again, but the only thing that + * worked was a full device reset. It feels like an awfully big hammer, + * but doing a full reset after every multiblock write is still faster + * than doing single-block IO (by almost two orders of magnitude: + * 20KB/sec improves to about 1.8MB/sec best case). + * + * After doing the reset, wait for a NOTBUSY interrupt before continuing + * with the next operation. + */ + + if (sc->flags & CMD_MULTIWRITE) { + at91_mci_reset(sc); + WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY); + return; + } + + /* This is known to be true of at91rm9200 hardware; it may or may not + * apply to more recent chips: + * + * After stopping a multi-block read, loop to read and discard any data + * that coasts in after we sent the stop command. The docs don't say + * anything about it, but empirical testing shows that 1-3 additional + * words of data get buffered up in some unmentioned internal fifo and + * if we don't read and discard them here they end up on the front of + * the next read DMA transfer we do. + */ + + if (sc->flags & CMD_MULTIREAD) { + uint32_t sr; + int count = 0; + do { + sr = RD4(sc, MCI_SR); + if (sr & MCI_SR_RXRDY) { + RD4(sc, MCI_RDR); + ++count; + } + } while (sr & MCI_SR_RXRDY); + at91_mci_reset(sc); +// if (count != 0) +// printf("Had to soak up %d words after read\n", count); + } + + cmd->error = MMC_ERR_NONE; + at91_mci_next_operation(sc); + +} + +static void +at91_mci_cmdrdy(struct at91_mci_softc *sc, uint32_t sr) +{ + struct mmc_command *cmd = sc->curcmd; + int i; + + if (cmd == NULL) + return; + + /* We get here at the end of EVERY command. We retrieve the command + * response (if any) then decide what to do next based on the command. + */ + + if (cmd->flags & MMC_RSP_PRESENT) { + for (i = 0; i < ((cmd->flags & MMC_RSP_136) ? 4 : 1); i++) { + cmd->resp[i] = RD4(sc, MCI_RSPR + i * 4); + if (mci_debug) + printf("RSPR[%d] = %x sr=%x\n", i, cmd->resp[i], sr); + } + } + + /* If this was a stop command, go handle the various special + * conditions (read: bugs) that have to be dealt with following a stop. + */ + + if (cmd->opcode == MMC_STOP_TRANSMISSION) { + at91_mci_stop_done(sc, sr); + return; + } + + /* If this command can continue to assert BUSY beyond the response then + * we need to wait for NOTBUSY before the command is really done. + * + * Note that this may not work properly on the at91rm9200. It certainly + * doesn't work for the STOP command that follows a multi-block write, + * so post-stop CMDRDY is handled separately; see the special handling + * in at91_mci_stop_done(). + * + * Beside STOP, there are other R1B-type commands that use the busy + * signal after CMDRDY: CMD7 (card select), CMD28-29 (write protect), + * CMD38 (erase). I haven't tested any of them, but I rather expect + * them all to have the same sort of problem with MCI_SR not actually + * reflecting the state of the DAT0-line busy indicator. So this code + * may need to grow some sort of special handling for them too. (This + * just in: CMD7 isn't a problem right now because dev/mmc.c incorrectly + * sets the response flags to R1 rather than R1B.) + */ + + if ((cmd->flags & MMC_RSP_BUSY)) { + WR4(sc, MCI_IER, MCI_SR_ERROR | MCI_SR_NOTBUSY); + return; + } + + /* If there is a data transfer with this command, then... + * - If it's a read, we need to wait for ENDRX. + * - If it's a write, now is the time to enable the PDC, and we need to + * wait for a BLKE that follows a TXBUFE, because if we're doing a + * split transfer we get a BLKE after the first half (when TPR/TCR get + * loaded from TNPR/TNCR). So first we wait for the TXBUFE, and the + * handling for that interrupt will then invoke the wait for the + * subsequent BLKE which indicates actual completion. + */ + + if (cmd->data) { + uint32_t ier; + if (cmd->data->flags & MMC_DATA_READ) { + ier = MCI_SR_ENDRX; + } else { + ier = MCI_SR_TXBUFE; + WR4(sc, PDC_PTCR, PDC_PTCR_TXTEN); + } + WR4(sc, MCI_IER, MCI_SR_ERROR | ier); + return; + } + + /* If we made it to here, we don't need to wait for anything more for + * the current command, move on to the next command (will complete the + * request if there is no next command). + */ + + cmd->error = MMC_ERR_NONE; + at91_mci_next_operation(sc); } static void at91_mci_intr(void *arg) { struct at91_mci_softc *sc = (struct at91_mci_softc*)arg; - uint32_t sr; - int i, done = 0; - struct mmc_command *cmd; + struct mmc_command *cmd = sc->curcmd; + uint32_t sr, isr; AT91_MCI_LOCK(sc); - sr = RD4(sc, MCI_SR) & RD4(sc, MCI_IMR); -// printf("i 0x%x\n", sr); - cmd = sc->curcmd; - if (sr & MCI_SR_ERROR) { - // Ignore CRC errors on CMD2 and ACMD47, per relevant standards - if ((sr & MCI_SR_RCRCE) && (cmd->opcode == MMC_SEND_OP_COND || - cmd->opcode == ACMD_SD_SEND_OP_COND)) - cmd->error = MMC_ERR_NONE; - else if (sr & (MCI_SR_RTOE | MCI_SR_DTOE)) + + sr = RD4(sc, MCI_SR); + isr = sr & RD4(sc, MCI_IMR); + + if (mci_debug) + printf("i 0x%x sr 0x%x\n", isr, sr); + + /* All interrupts are one-shot; disable it now. + * The next operation will re-enable whatever interrupts it wants. + */ + + WR4(sc, MCI_IDR, isr); + + if (isr & MCI_SR_ERROR) { + if (isr & (MCI_SR_RTOE | MCI_SR_DTOE)) cmd->error = MMC_ERR_TIMEOUT; - else if (sr & (MCI_SR_RCRCE | MCI_SR_DCRCE)) + else if (isr & (MCI_SR_RCRCE | MCI_SR_DCRCE)) cmd->error = MMC_ERR_BADCRC; - else if (sr & (MCI_SR_OVRE | MCI_SR_UNRE)) + else if (isr & (MCI_SR_OVRE | MCI_SR_UNRE)) cmd->error = MMC_ERR_FIFO; else cmd->error = MMC_ERR_FAILED; - done = 1; - if (sc->mapped && cmd->error) { - bus_dmamap_unload(sc->dmatag, sc->map); - sc->mapped--; + /* CMD8 is used to probe for SDHC cards, a standard SD card will + * get a response timeout; don't report it because it's a normal + * and expected condition. One might argue that all error + * reporting should be left to higher levels, but when they + * report at all it's always EIO, which isn't very helpful. + */ + if (cmd->opcode != 8) { + device_printf(sc->dev, + "IO error; status MCI_SR = 0x%x cmd opcode = %d%s\n", + sr, cmd->opcode, + (cmd->opcode != 12) ? "" : + (sc->flags & CMD_MULTIREAD) ? " after read" : " after write"); + at91_mci_reset(sc); } + at91_mci_next_operation(sc); } else { - if (sr & MCI_SR_TXBUFE) { + if (isr & MCI_SR_TXBUFE) { // printf("TXBUFE\n"); - at91_mci_xmit_done(sc); + /* We need to wait for a BLKE that follows TXBUFE + * (intermediate BLKEs might happen after ENDTXes if + * we're chaining multiple buffers). If BLKE is also + * asserted at the time we get TXBUFE, we can avoid + * another interrupt and process it right away, below. + */ + if (sr & MCI_SR_BLKE) + isr |= MCI_SR_BLKE; + else + WR4(sc, MCI_IER, MCI_SR_BLKE); } - if (sr & MCI_SR_RXBUFF) { + if (isr & MCI_SR_RXBUFF) { // printf("RXBUFF\n"); - WR4(sc, MCI_IDR, MCI_SR_RXBUFF); - WR4(sc, MCI_IER, MCI_SR_CMDRDY); } - if (sr & MCI_SR_ENDTX) { + if (isr & MCI_SR_ENDTX) { // printf("ENDTX\n"); } - if (sr & MCI_SR_ENDRX) { + if (isr & MCI_SR_ENDRX) { // printf("ENDRX\n"); - at91_mci_read_done(sc); + at91_mci_read_done(sc, sr); } - if (sr & MCI_SR_NOTBUSY) { + if (isr & MCI_SR_NOTBUSY) { // printf("NOTBUSY\n"); - WR4(sc, MCI_IDR, MCI_SR_NOTBUSY); - WR4(sc, MCI_IER, MCI_SR_CMDRDY); + at91_mci_notbusy(sc); } - if (sr & MCI_SR_DTIP) { + if (isr & MCI_SR_DTIP) { // printf("Data transfer in progress\n"); } - if (sr & MCI_SR_BLKE) { + if (isr & MCI_SR_BLKE) { // printf("Block transfer end\n"); + at91_mci_write_done(sc, sr); } - if (sr & MCI_SR_TXRDY) { + if (isr & MCI_SR_TXRDY) { // printf("Ready to transmit\n"); } - if (sr & MCI_SR_RXRDY) { + if (isr & MCI_SR_RXRDY) { // printf("Ready to receive\n"); } - if (sr & MCI_SR_CMDRDY) { + if (isr & MCI_SR_CMDRDY) { // printf("Command ready\n"); - done = 1; - cmd->error = MMC_ERR_NONE; - } - } - if (done) { - WR4(sc, MCI_IDR, 0xffffffff); - if (cmd != NULL && (cmd->flags & MMC_RSP_PRESENT)) { - for (i = 0; i < ((cmd->flags & MMC_RSP_136) ? 4 : 1); - i++) { - cmd->resp[i] = RD4(sc, MCI_RSPR + i * 4); -// printf("RSPR[%d] = %x\n", i, cmd->resp[i]); - } + at91_mci_cmdrdy(sc, sr); } - at91_mci_start(sc); } AT91_MCI_UNLOCK(sc); } @@ -703,7 +1247,7 @@ *(int *)result = sc->host.caps; break; case MMCBR_IVAR_MAX_DATA: - *(int *)result = 1; + *(int *)result = MAX_BLOCKS; break; } return (0);