Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 19 Jun 2016 03:45:32 +0000 (UTC)
From:      Adrian Chadd <adrian@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r302017 - head/sys/dev/ath
Message-ID:  <201606190345.u5J3jWKi079768@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: adrian
Date: Sun Jun 19 03:45:32 2016
New Revision: 302017
URL: https://svnweb.freebsd.org/changeset/base/302017

Log:
  [ath] add support for batching frames to the general TX queues.
  
  It turns out the frame scheduling policies (eg DBA_GATED) operate on
  a single TX FIFO entry.  ASAP scheduling is fine; those frames always
  go out.
  
  DBA-gated sets the TX queue ready when the DBA timer fires, which triggers
  a beacon transmit.  Normally this is used for content-after-beacon queue
  (CABQ) work, which needs to burst out immediately after a beacon.
  (eg broadcast, multicast, etc frames.)  This is a general policy that you
  can use for any queue, and Sam's TDMA code uses it.
  
  When DBA_GATED is used and something like say, an 11e TX burst window,
  it only operates on a single TX FIFO entry.  If you have a single frame
  per TX FIFO entry and say, a 2.5ms long burst window (eg TDMA!) then it'll
  only burst a single frame every 2.5ms.  If there's no gating (eg ASAP) then
  the burst window is fine, and multiple TX FIFO slots get used.
  
  The CABQ code does pack in a list of frames (ie, the whole cabq) but
  up until this commit, the normal TX queues didn't.  It showed up when
  I started to debug TDMA on the AR9380 and later.
  
  This commit doesn't fix the TDMA case - that's still broken here, because
  all I'm doing here is allowing 'some' frames to be burst, but I'm
  certainly not filling the whole TX FIFO slot entry with frames.
  Doing that 'properly' kind of requires me to take into account how long
  packets should take to transmit and say, doing 1.5 or something times that
  per TX FIFO slot, as if you partially transmit a slot, when it's next
  gated it'll just finish that TX FIFO slot, then not advance to the next
  one.
  
  Now, I /also/ think queuing a new packet restarts DMA, but you have to
  push new frames into the TX FIFO.  I need to experiment some more with
  this because if it's really the case, I will be able to do TDMA support
  without the egregious hacks I have in my local tree.  Sam's TDMA code
  for previous chips would just kick the TXE bit to push along DMA
  again, but we can't do that for EDMA chips - we /have/ to push a new
  frame into the TX FIFO to restart DMA.  Ugh.
  
  Tested:
  
  * AR9380, STA mode
  * AR9380, hostap mode
  * AR9580, hostap mode
  
  Approved by:	re (gjb)

Modified:
  head/sys/dev/ath/if_ath_tx_edma.c

Modified: head/sys/dev/ath/if_ath_tx_edma.c
==============================================================================
--- head/sys/dev/ath/if_ath_tx_edma.c	Sun Jun 19 03:44:32 2016	(r302016)
+++ head/sys/dev/ath/if_ath_tx_edma.c	Sun Jun 19 03:45:32 2016	(r302017)
@@ -138,79 +138,186 @@ MALLOC_DECLARE(M_ATHDEV);
 
 static void ath_edma_tx_processq(struct ath_softc *sc, int dosched);
 
-/*
- * Push some frames into the TX FIFO if we have space.
- */
+#ifdef	ATH_DEBUG_ALQ
 static void
-ath_edma_tx_fifo_fill(struct ath_softc *sc, struct ath_txq *txq)
+ath_tx_alq_edma_push(struct ath_softc *sc, int txq, int nframes,
+    int fifo_depth, int frame_cnt)
+{
+	struct if_ath_alq_tx_fifo_push aq;
+
+	aq.txq = htobe32(txq);
+	aq.nframes = htobe32(nframes);
+	aq.fifo_depth = htobe32(fifo_depth);
+	aq.frame_cnt = htobe32(frame_cnt);
+
+	if_ath_alq_post(&sc->sc_alq, ATH_ALQ_TX_FIFO_PUSH,
+	    sizeof(aq),
+	    (const char *) &aq);
+}
+#endif	/* ATH_DEBUG_ALQ */
+
+static void
+ath_tx_edma_push_staging_list(struct ath_softc *sc, struct ath_txq *txq,
+    int limit)
 {
 	struct ath_buf *bf, *bf_last;
-	int i = 0;
+	struct ath_buf *bfi, *bfp;
+	int i, sqdepth;
+	TAILQ_HEAD(axq_q_f_s, ath_buf)  sq;
 
 	ATH_TXQ_LOCK_ASSERT(txq);
 
-	DPRINTF(sc, ATH_DEBUG_TX_PROC, "%s: Q%d: called\n",
-	    __func__,
-	    txq->axq_qnum);
+	/*
+	 * Don't bother doing any work if it's full.
+	 */
+	if (txq->axq_fifo_depth >= HAL_TXFIFO_DEPTH)
+		return;
 
-	TAILQ_FOREACH(bf, &txq->axq_q, bf_list) {
-		if (txq->axq_fifo_depth >= HAL_TXFIFO_DEPTH)
-			break;
+	if (TAILQ_EMPTY(&txq->axq_q))
+		return;
 
-		/*
-		 * We have space in the FIFO - so let's push a frame
-		 * into it.
-		 */
+	TAILQ_INIT(&sq);
 
-		/*
-		 * Remove it from the normal list
-		 */
+	/*
+	 * First pass - walk sq, queue up to 'limit' entries,
+	 * subtract them from the staging queue.
+	 */
+	sqdepth = 0;
+	for (i = 0; i < limit; i++) {
+		/* Grab the head entry */
+		bf = ATH_TXQ_FIRST(txq);
+		if (bf == NULL)
+			break;
 		ATH_TXQ_REMOVE(txq, bf, bf_list);
 
-		/*
-		 * XXX for now, we only dequeue a frame at a time, so
-		 * that's only one buffer.  Later on when we just
-		 * push this staging _list_ into the queue, we'll
-		 * set bf_last to the end pointer in the list.
-		 */
-		bf_last = bf;
-		DPRINTF(sc, ATH_DEBUG_TX_PROC,
-		    "%s: Q%d: depth=%d; pushing %p->%p\n",
-		    __func__,
-		    txq->axq_qnum,
-		    txq->axq_fifo_depth,
-		    bf,
-		    bf_last);
+		/* Queue it into our staging list */
+		TAILQ_INSERT_TAIL(&sq, bf, bf_list);
+		sqdepth++;
+	}
 
-		/*
-		 * Append it to the FIFO staging list
-		 */
-		ATH_TXQ_INSERT_TAIL(&txq->fifo, bf, bf_list);
+	/*
+	 * Ok, so now we have a staging list of up to 'limit'
+	 * frames from the txq.  Now let's wrap that up
+	 * into its own list and pass that to the hardware
+	 * as one FIFO entry.
+	 */
 
-		/*
-		 * Set fifo start / fifo end flags appropriately
-		 *
-		 */
-		bf->bf_flags |= ATH_BUF_FIFOPTR;
-		bf_last->bf_flags |= ATH_BUF_FIFOEND;
+	bf = TAILQ_FIRST(&sq);
+	bf_last = TAILQ_LAST(&sq, axq_q_s);
 
-		/*
-		 * Push _into_ the FIFO.
-		 */
-		ath_hal_puttxbuf(sc->sc_ah, txq->axq_qnum, bf->bf_daddr);
+	/*
+	 * Ok, so here's the gymnastics required to make this
+	 * all sensible.
+	 */
+
+	/*
+	 * Tag the first/last buffer appropriately.
+	 */
+	bf->bf_flags |= ATH_BUF_FIFOPTR;
+	bf_last->bf_flags |= ATH_BUF_FIFOEND;
+
+	/*
+	 * Walk the descriptor list and link them appropriately.
+	 */
+	bfp = NULL;
+	TAILQ_FOREACH(bfi, &sq, bf_list) {
+		if (bfp != NULL) {
+			ath_hal_settxdesclink(sc->sc_ah, bfp->bf_lastds,
+			    bfi->bf_daddr);
+		}
+		bfp = bfi;
+	}
+
+	i = 0;
+	TAILQ_FOREACH(bfi, &sq, bf_list) {
 #ifdef	ATH_DEBUG
 		if (sc->sc_debug & ATH_DEBUG_XMIT_DESC)
-			ath_printtxbuf(sc, bf, txq->axq_qnum, i, 0);
+			ath_printtxbuf(sc, bfi, txq->axq_qnum, i, 0);
 #endif/* ATH_DEBUG */
 #ifdef	ATH_DEBUG_ALQ
 		if (if_ath_alq_checkdebug(&sc->sc_alq, ATH_ALQ_EDMA_TXDESC))
-			ath_tx_alq_post(sc, bf);
+			ath_tx_alq_post(sc, bfi);
 #endif /* ATH_DEBUG_ALQ */
-		txq->axq_fifo_depth++;
 		i++;
 	}
-	if (i > 0)
-		ath_hal_txstart(sc->sc_ah, txq->axq_qnum);
+
+	/*
+	 * We now need to push this set of frames onto the tail
+	 * of the FIFO queue.  We don't adjust the aggregate
+	 * count, only the queue depth counter(s).
+	 * We also need to blank the link pointer now.
+	 */
+
+	TAILQ_CONCAT(&txq->fifo.axq_q, &sq, bf_list);
+	/* Bump total queue tracking in FIFO queue */
+	txq->fifo.axq_depth += sqdepth;
+
+	/* Bump FIFO queue */
+	txq->axq_fifo_depth++;
+	DPRINTF(sc, ATH_DEBUG_XMIT,
+	    "%s: queued %d packets; depth=%d, fifo depth=%d\n",
+	    __func__, sqdepth, txq->fifo.axq_depth, txq->axq_fifo_depth);
+
+	/* Push the first entry into the hardware */
+	ath_hal_puttxbuf(sc->sc_ah, txq->axq_qnum, bf->bf_daddr);
+
+	/* Push start on the DMA if it's not already started */
+	ath_hal_txstart(sc->sc_ah, txq->axq_qnum);
+
+#ifdef	ATH_DEBUG_ALQ
+	ath_tx_alq_edma_push(sc, txq->axq_qnum, sqdepth,
+	    txq->axq_fifo_depth,
+	    txq->fifo.axq_depth);
+#endif /* ATH_DEBUG_ALQ */
+}
+
+/*
+ * Push some frames into the TX FIFO if we have space.
+ */
+static void
+ath_edma_tx_fifo_fill(struct ath_softc *sc, struct ath_txq *txq)
+{
+
+	ATH_TXQ_LOCK_ASSERT(txq);
+
+	DPRINTF(sc, ATH_DEBUG_TX_PROC, "%s: Q%d: called\n",
+	    __func__,
+	    txq->axq_qnum);
+
+	/*
+	 * For now, push up to 4 frames per TX FIFO slot.
+	 * If more are in the hardware queue then they'll
+	 * get populated when we try to send another frame
+	 * or complete a frame - so at most there'll be
+	 * 32 non-AMPDU frames per TXQ.
+	 *
+	 * Note that the hardware staging queue will limit
+	 * how many frames in total we will have pushed into
+	 * here.
+	 *
+	 * Later on, we'll want to push fewer frames into
+	 * the TX FIFO since we don't want to necessarily
+	 * fill tens or hundreds of milliseconds of potential
+	 * frames.
+	 *
+	 * However, we need more frames right now because of
+	 * how the MAC implements the frame scheduling policy.
+	 * It only ungates a single FIFO entry at a time,
+	 * and will run that until CHNTIME expires or the
+	 * end of that FIFO entry descriptor list is reached.
+	 * So for TDMA we suffer a big performance penalty -
+	 * single TX FIFO entries mean the MAC only sends out
+	 * one frame per DBA event, which turned out to average
+	 * 6ms per TX frame.
+	 *
+	 * So, for aggregates it's okay - it'll push two at a
+	 * time and this will just do them more efficiently.
+	 * For non-aggregates it'll do 4 at a time, up to the
+	 * non-aggr limit (non_aggr, which is 32.)  They should
+	 * be time based rather than a hard count, but I also
+	 * do need sleep.
+	 */
+	ath_tx_edma_push_staging_list(sc, txq, 4);
 }
 
 /*



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201606190345.u5J3jWKi079768>