Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 1 Oct 2025 22:57:12 GMT
From:      Andrew Gallatin <gallatin@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: 213170eb956f - main - iflib: Implement tx desc reclaim threshold
Message-ID:  <202510012257.591MvCth033595@gitrepo.freebsd.org>

next in thread | raw e-mail | index | archive | help
The branch main has been updated by gallatin:

URL: https://cgit.FreeBSD.org/src/commit/?id=213170eb956fa7eec93c2ab4159d6ed68e8e1b1d

commit 213170eb956fa7eec93c2ab4159d6ed68e8e1b1d
Author:     Andrew Gallatin <gallatin@FreeBSD.org>
AuthorDate: 2025-10-01 22:43:05 +0000
Commit:     Andrew Gallatin <gallatin@FreeBSD.org>
CommitDate: 2025-10-01 22:55:00 +0000

    iflib: Implement tx desc reclaim threshold
    
    On some iflib drivers, the txd reclaim routine can be fairly expensive
    at high packet rates.  Iflib was designed with the intent of only
    reclaiming tx descriptors above a configurable threshold, but this
    logic was left unimplemented.
    
    This change:
    
       - implements 2 new knobs, iflib.tx_reclaim_thresh and
          iflib.tx_reclaim_ticks.
       - moves tx reclaim thresh from the if_shared_ctx and into the
          iflib_ctx as drivers don't need to see it, and it needs to be
          changed, so it can't be const
       - tx_reclaim_thresh and ticks are replicated into the txq to
          improve cache locality of data accessed in the hot path
        - ticks is used rather than more expensive timekeeping mechanism so
           as to keep things simple and cheap
    
    This change substantially improves packet rates on bnxt. It has been
    tested on bxnt and ixl
    
    Sponsored by: Netflix
    Differential Revision: https://reviews.freebsd.org/D52561
    Reviewed by: markj (initial version)
---
 sys/net/iflib.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 sys/net/iflib.h |   2 +-
 2 files changed, 101 insertions(+), 13 deletions(-)

diff --git a/sys/net/iflib.c b/sys/net/iflib.c
index e2005aa28c5c..d2625da19cd2 100644
--- a/sys/net/iflib.c
+++ b/sys/net/iflib.c
@@ -202,6 +202,8 @@ struct iflib_ctx {
 	uint16_t ifc_sysctl_extra_msix_vectors;
 	bool     ifc_cpus_are_physical_cores;
 	bool     ifc_sysctl_simple_tx;
+	uint16_t ifc_sysctl_tx_reclaim_thresh;
+	uint16_t ifc_sysctl_tx_reclaim_ticks;
 
 	qidx_t ifc_sysctl_ntxds[8];
 	qidx_t ifc_sysctl_nrxds[8];
@@ -345,7 +347,9 @@ struct iflib_txq {
 	uint16_t	ift_npending;
 	uint16_t	ift_db_pending;
 	uint16_t	ift_rs_pending;
-	/* implicit pad */
+	uint32_t	ift_last_reclaim;
+	uint16_t	ift_reclaim_thresh;
+	uint16_t	ift_reclaim_ticks;
 	uint8_t		ift_txd_size[8];
 	uint64_t	ift_processed;
 	uint64_t	ift_cleaned;
@@ -729,7 +733,7 @@ static void iflib_free_intr_mem(if_ctx_t ctx);
 #ifndef __NO_STRICT_ALIGNMENT
 static struct mbuf *iflib_fixup_rx(struct mbuf *m);
 #endif
-static __inline int iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh);
+static __inline int iflib_completed_tx_reclaim(iflib_txq_t txq);
 
 static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets =
     SLIST_HEAD_INITIALIZER(cpu_offsets);
@@ -3084,8 +3088,6 @@ txq_max_rs_deferred(iflib_txq_t txq)
 #define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx))
 #define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments))
 
-/* XXX we should be setting this to something other than zero */
-#define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh)
 #define	MAX_TX_DESC(ctx) MAX((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max, \
     (ctx)->ifc_softc_ctx.isc_tx_nsegments)
 
@@ -3642,7 +3644,7 @@ defrag:
 	 *        cxgb
 	 */
 	if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
-		(void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+		(void)iflib_completed_tx_reclaim(txq);
 		if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) {
 			txq->ift_no_desc_avail++;
 			bus_dmamap_unload(buf_tag, map);
@@ -3785,14 +3787,21 @@ iflib_tx_desc_free(iflib_txq_t txq, int n)
 }
 
 static __inline int
-iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh)
+iflib_completed_tx_reclaim(iflib_txq_t txq)
 {
-	int reclaim;
+	int reclaim, thresh;
+	uint32_t now;
 	if_ctx_t ctx = txq->ift_ctx;
 
+	thresh = txq->ift_reclaim_thresh;
 	KASSERT(thresh >= 0, ("invalid threshold to reclaim"));
 	MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size);
 
+	now = ticks;
+	if (now <= (txq->ift_last_reclaim + txq->ift_reclaim_ticks) &&
+	    txq->ift_in_use < thresh)
+		return (0);
+	txq->ift_last_reclaim = now;
 	/*
 	 * Need a rate-limiting check so that this isn't called every time
 	 */
@@ -3873,7 +3882,7 @@ iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
 		DBG_COUNTER_INC(txq_drain_notready);
 		return (0);
 	}
-	reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+	reclaimed = iflib_completed_tx_reclaim(txq);
 	rang = iflib_txd_db_check(txq, reclaimed && txq->ift_db_pending);
 	avail = IDXDIFF(pidx, cidx, r->size);
 
@@ -3952,7 +3961,7 @@ iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
 	}
 
 	/* deliberate use of bitwise or to avoid gratuitous short-circuit */
-	ring = rang ? false  : (iflib_min_tx_latency | err);
+	ring = rang ? false  : (iflib_min_tx_latency | err | (!!txq->ift_reclaim_thresh));
 	iflib_txd_db_check(txq, ring);
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
@@ -4032,7 +4041,7 @@ _task_fn_tx(void *context)
 #endif
         if (ctx->ifc_sysctl_simple_tx) {
                 mtx_lock(&txq->ift_mtx);
-                (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+                (void)iflib_completed_tx_reclaim(txq);
                 mtx_unlock(&txq->ift_mtx);
                 goto skip_ifmp;
         }
@@ -5883,6 +5892,7 @@ iflib_queues_alloc(if_ctx_t ctx)
 			device_printf(dev, "Unable to allocate buf_ring\n");
 			goto err_tx_desc;
 		}
+		txq->ift_reclaim_thresh = ctx->ifc_sysctl_tx_reclaim_thresh;
 	}
 
 	for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) {
@@ -6774,6 +6784,74 @@ mp_ndesc_handler(SYSCTL_HANDLER_ARGS)
 	return (rc);
 }
 
+static int
+iflib_handle_tx_reclaim_thresh(SYSCTL_HANDLER_ARGS)
+{
+	if_ctx_t ctx = (void *)arg1;
+	iflib_txq_t txq;
+	int i, err;
+	int thresh;
+
+	thresh = ctx->ifc_sysctl_tx_reclaim_thresh;
+	err = sysctl_handle_int(oidp, &thresh, arg2, req);
+	if (err != 0) {
+		return err;
+	}
+
+	if (thresh == ctx->ifc_sysctl_tx_reclaim_thresh)
+		return 0;
+
+	if (thresh > ctx->ifc_softc_ctx.isc_ntxd[0] / 2) {
+		device_printf(ctx->ifc_dev, "TX Reclaim thresh must be <= %d\n",
+		    ctx->ifc_softc_ctx.isc_ntxd[0] / 2);
+		return (EINVAL);
+	}
+
+	ctx->ifc_sysctl_tx_reclaim_thresh = thresh;
+	if (ctx->ifc_txqs == NULL)
+		return (err);
+
+	txq = &ctx->ifc_txqs[0];
+	for (i = 0; i < NTXQSETS(ctx); i++, txq++) {
+		txq->ift_reclaim_thresh = thresh;
+	}
+	return (err);
+}
+
+static int
+iflib_handle_tx_reclaim_ticks(SYSCTL_HANDLER_ARGS)
+{
+	if_ctx_t ctx = (void *)arg1;
+	iflib_txq_t txq;
+	int i, err;
+	int ticks;
+
+	ticks = ctx->ifc_sysctl_tx_reclaim_ticks;
+	err = sysctl_handle_int(oidp, &ticks, arg2, req);
+	if (err != 0) {
+		return err;
+	}
+
+	if (ticks == ctx->ifc_sysctl_tx_reclaim_ticks)
+		return 0;
+
+	if (ticks > hz) {
+		device_printf(ctx->ifc_dev,
+		    "TX Reclaim ticks must be <= hz (%d)\n", hz);
+		return (EINVAL);
+	}
+
+	ctx->ifc_sysctl_tx_reclaim_ticks = ticks;
+	if (ctx->ifc_txqs == NULL)
+		return (err);
+
+	txq = &ctx->ifc_txqs[0];
+	for (i = 0; i < NTXQSETS(ctx); i++, txq++) {
+		txq->ift_reclaim_ticks = ticks;
+	}
+	return (err);
+}
+
 #define NAME_BUFLEN 32
 static void
 iflib_add_device_sysctl_pre(if_ctx_t ctx)
@@ -6862,6 +6940,16 @@ iflib_add_device_sysctl_post(if_ctx_t ctx)
 	node = ctx->ifc_sysctl_node;
 	child = SYSCTL_CHILDREN(node);
 
+       SYSCTL_ADD_PROC(ctx_list, child, OID_AUTO, "tx_reclaim_thresh",
+           CTLTYPE_INT | CTLFLAG_RWTUN, ctx,
+           0, iflib_handle_tx_reclaim_thresh, "I",
+           "Number of TX descs outstanding before reclaim is called");
+
+       SYSCTL_ADD_PROC(ctx_list, child, OID_AUTO, "tx_reclaim_ticks",
+           CTLTYPE_INT | CTLFLAG_RWTUN, ctx,
+           0, iflib_handle_tx_reclaim_ticks, "I",
+           "Number of ticks before a TX reclaim is forced");
+
 	if (scctx->isc_ntxqsets > 100)
 		qfmt = "txq%03d";
 	else if (scctx->isc_ntxqsets > 10)
@@ -7109,7 +7197,7 @@ iflib_debugnet_poll(if_t ifp, int count)
 		return (EBUSY);
 
 	txq = &ctx->ifc_txqs[0];
-	(void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+	(void)iflib_completed_tx_reclaim(txq);
 
 	NET_EPOCH_ENTER(et);
 	for (i = 0; i < scctx->isc_nrxqsets; i++)
@@ -7159,7 +7247,7 @@ iflib_simple_transmit(if_t ifp, struct mbuf *m)
 		else
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 	}
-	(void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+	(void)iflib_completed_tx_reclaim(txq);
 	mtx_unlock(&txq->ift_mtx);
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
diff --git a/sys/net/iflib.h b/sys/net/iflib.h
index 3817445228d0..e65c936fc4b4 100644
--- a/sys/net/iflib.h
+++ b/sys/net/iflib.h
@@ -272,7 +272,7 @@ struct if_shared_ctx {
 	int isc_ntxqs;			/* # of tx queues per tx qset - usually 1 */
 	int isc_nrxqs;			/* # of rx queues per rx qset - intel 1, chelsio 2, broadcom 3 */
 	int __spare0__;
-	int isc_tx_reclaim_thresh;
+	int __spare1__;
 	int isc_flags;
 };
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202510012257.591MvCth033595>