Date: Wed, 1 Oct 2025 22:57:12 GMT From: Andrew Gallatin <gallatin@FreeBSD.org> To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org Subject: git: 213170eb956f - main - iflib: Implement tx desc reclaim threshold Message-ID: <202510012257.591MvCth033595@gitrepo.freebsd.org>
next in thread | raw e-mail | index | archive | help
The branch main has been updated by gallatin: URL: https://cgit.FreeBSD.org/src/commit/?id=213170eb956fa7eec93c2ab4159d6ed68e8e1b1d commit 213170eb956fa7eec93c2ab4159d6ed68e8e1b1d Author: Andrew Gallatin <gallatin@FreeBSD.org> AuthorDate: 2025-10-01 22:43:05 +0000 Commit: Andrew Gallatin <gallatin@FreeBSD.org> CommitDate: 2025-10-01 22:55:00 +0000 iflib: Implement tx desc reclaim threshold On some iflib drivers, the txd reclaim routine can be fairly expensive at high packet rates. Iflib was designed with the intent of only reclaiming tx descriptors above a configurable threshold, but this logic was left unimplemented. This change: - implements 2 new knobs, iflib.tx_reclaim_thresh and iflib.tx_reclaim_ticks. - moves tx reclaim thresh from the if_shared_ctx and into the iflib_ctx as drivers don't need to see it, and it needs to be changed, so it can't be const - tx_reclaim_thresh and ticks are replicated into the txq to improve cache locality of data accessed in the hot path - ticks is used rather than more expensive timekeeping mechanism so as to keep things simple and cheap This change substantially improves packet rates on bnxt. It has been tested on bxnt and ixl Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D52561 Reviewed by: markj (initial version) --- sys/net/iflib.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++------ sys/net/iflib.h | 2 +- 2 files changed, 101 insertions(+), 13 deletions(-) diff --git a/sys/net/iflib.c b/sys/net/iflib.c index e2005aa28c5c..d2625da19cd2 100644 --- a/sys/net/iflib.c +++ b/sys/net/iflib.c @@ -202,6 +202,8 @@ struct iflib_ctx { uint16_t ifc_sysctl_extra_msix_vectors; bool ifc_cpus_are_physical_cores; bool ifc_sysctl_simple_tx; + uint16_t ifc_sysctl_tx_reclaim_thresh; + uint16_t ifc_sysctl_tx_reclaim_ticks; qidx_t ifc_sysctl_ntxds[8]; qidx_t ifc_sysctl_nrxds[8]; @@ -345,7 +347,9 @@ struct iflib_txq { uint16_t ift_npending; uint16_t ift_db_pending; uint16_t ift_rs_pending; - /* implicit pad */ + uint32_t ift_last_reclaim; + uint16_t ift_reclaim_thresh; + uint16_t ift_reclaim_ticks; uint8_t ift_txd_size[8]; uint64_t ift_processed; uint64_t ift_cleaned; @@ -729,7 +733,7 @@ static void iflib_free_intr_mem(if_ctx_t ctx); #ifndef __NO_STRICT_ALIGNMENT static struct mbuf *iflib_fixup_rx(struct mbuf *m); #endif -static __inline int iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh); +static __inline int iflib_completed_tx_reclaim(iflib_txq_t txq); static SLIST_HEAD(cpu_offset_list, cpu_offset) cpu_offsets = SLIST_HEAD_INITIALIZER(cpu_offsets); @@ -3084,8 +3088,6 @@ txq_max_rs_deferred(iflib_txq_t txq) #define QIDX(ctx, m) ((((m)->m_pkthdr.flowid & ctx->ifc_softc_ctx.isc_rss_table_mask) % NTXQSETS(ctx)) + FIRST_QSET(ctx)) #define DESC_RECLAIMABLE(q) ((int)((q)->ift_processed - (q)->ift_cleaned - (q)->ift_ctx->ifc_softc_ctx.isc_tx_nsegments)) -/* XXX we should be setting this to something other than zero */ -#define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh) #define MAX_TX_DESC(ctx) MAX((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max, \ (ctx)->ifc_softc_ctx.isc_tx_nsegments) @@ -3642,7 +3644,7 @@ defrag: * cxgb */ if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) { - (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); + (void)iflib_completed_tx_reclaim(txq); if (__predict_false(nsegs + 2 > TXQ_AVAIL(txq))) { txq->ift_no_desc_avail++; bus_dmamap_unload(buf_tag, map); @@ -3785,14 +3787,21 @@ iflib_tx_desc_free(iflib_txq_t txq, int n) } static __inline int -iflib_completed_tx_reclaim(iflib_txq_t txq, int thresh) +iflib_completed_tx_reclaim(iflib_txq_t txq) { - int reclaim; + int reclaim, thresh; + uint32_t now; if_ctx_t ctx = txq->ift_ctx; + thresh = txq->ift_reclaim_thresh; KASSERT(thresh >= 0, ("invalid threshold to reclaim")); MPASS(thresh /*+ MAX_TX_DESC(txq->ift_ctx) */ < txq->ift_size); + now = ticks; + if (now <= (txq->ift_last_reclaim + txq->ift_reclaim_ticks) && + txq->ift_in_use < thresh) + return (0); + txq->ift_last_reclaim = now; /* * Need a rate-limiting check so that this isn't called every time */ @@ -3873,7 +3882,7 @@ iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx) DBG_COUNTER_INC(txq_drain_notready); return (0); } - reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); + reclaimed = iflib_completed_tx_reclaim(txq); rang = iflib_txd_db_check(txq, reclaimed && txq->ift_db_pending); avail = IDXDIFF(pidx, cidx, r->size); @@ -3952,7 +3961,7 @@ iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx) } /* deliberate use of bitwise or to avoid gratuitous short-circuit */ - ring = rang ? false : (iflib_min_tx_latency | err); + ring = rang ? false : (iflib_min_tx_latency | err | (!!txq->ift_reclaim_thresh)); iflib_txd_db_check(txq, ring); if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent); if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent); @@ -4032,7 +4041,7 @@ _task_fn_tx(void *context) #endif if (ctx->ifc_sysctl_simple_tx) { mtx_lock(&txq->ift_mtx); - (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); + (void)iflib_completed_tx_reclaim(txq); mtx_unlock(&txq->ift_mtx); goto skip_ifmp; } @@ -5883,6 +5892,7 @@ iflib_queues_alloc(if_ctx_t ctx) device_printf(dev, "Unable to allocate buf_ring\n"); goto err_tx_desc; } + txq->ift_reclaim_thresh = ctx->ifc_sysctl_tx_reclaim_thresh; } for (rxconf = i = 0; i < nrxqsets; i++, rxconf++, rxq++) { @@ -6774,6 +6784,74 @@ mp_ndesc_handler(SYSCTL_HANDLER_ARGS) return (rc); } +static int +iflib_handle_tx_reclaim_thresh(SYSCTL_HANDLER_ARGS) +{ + if_ctx_t ctx = (void *)arg1; + iflib_txq_t txq; + int i, err; + int thresh; + + thresh = ctx->ifc_sysctl_tx_reclaim_thresh; + err = sysctl_handle_int(oidp, &thresh, arg2, req); + if (err != 0) { + return err; + } + + if (thresh == ctx->ifc_sysctl_tx_reclaim_thresh) + return 0; + + if (thresh > ctx->ifc_softc_ctx.isc_ntxd[0] / 2) { + device_printf(ctx->ifc_dev, "TX Reclaim thresh must be <= %d\n", + ctx->ifc_softc_ctx.isc_ntxd[0] / 2); + return (EINVAL); + } + + ctx->ifc_sysctl_tx_reclaim_thresh = thresh; + if (ctx->ifc_txqs == NULL) + return (err); + + txq = &ctx->ifc_txqs[0]; + for (i = 0; i < NTXQSETS(ctx); i++, txq++) { + txq->ift_reclaim_thresh = thresh; + } + return (err); +} + +static int +iflib_handle_tx_reclaim_ticks(SYSCTL_HANDLER_ARGS) +{ + if_ctx_t ctx = (void *)arg1; + iflib_txq_t txq; + int i, err; + int ticks; + + ticks = ctx->ifc_sysctl_tx_reclaim_ticks; + err = sysctl_handle_int(oidp, &ticks, arg2, req); + if (err != 0) { + return err; + } + + if (ticks == ctx->ifc_sysctl_tx_reclaim_ticks) + return 0; + + if (ticks > hz) { + device_printf(ctx->ifc_dev, + "TX Reclaim ticks must be <= hz (%d)\n", hz); + return (EINVAL); + } + + ctx->ifc_sysctl_tx_reclaim_ticks = ticks; + if (ctx->ifc_txqs == NULL) + return (err); + + txq = &ctx->ifc_txqs[0]; + for (i = 0; i < NTXQSETS(ctx); i++, txq++) { + txq->ift_reclaim_ticks = ticks; + } + return (err); +} + #define NAME_BUFLEN 32 static void iflib_add_device_sysctl_pre(if_ctx_t ctx) @@ -6862,6 +6940,16 @@ iflib_add_device_sysctl_post(if_ctx_t ctx) node = ctx->ifc_sysctl_node; child = SYSCTL_CHILDREN(node); + SYSCTL_ADD_PROC(ctx_list, child, OID_AUTO, "tx_reclaim_thresh", + CTLTYPE_INT | CTLFLAG_RWTUN, ctx, + 0, iflib_handle_tx_reclaim_thresh, "I", + "Number of TX descs outstanding before reclaim is called"); + + SYSCTL_ADD_PROC(ctx_list, child, OID_AUTO, "tx_reclaim_ticks", + CTLTYPE_INT | CTLFLAG_RWTUN, ctx, + 0, iflib_handle_tx_reclaim_ticks, "I", + "Number of ticks before a TX reclaim is forced"); + if (scctx->isc_ntxqsets > 100) qfmt = "txq%03d"; else if (scctx->isc_ntxqsets > 10) @@ -7109,7 +7197,7 @@ iflib_debugnet_poll(if_t ifp, int count) return (EBUSY); txq = &ctx->ifc_txqs[0]; - (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); + (void)iflib_completed_tx_reclaim(txq); NET_EPOCH_ENTER(et); for (i = 0; i < scctx->isc_nrxqsets; i++) @@ -7159,7 +7247,7 @@ iflib_simple_transmit(if_t ifp, struct mbuf *m) else if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); } - (void)iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); + (void)iflib_completed_tx_reclaim(txq); mtx_unlock(&txq->ift_mtx); if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent); if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent); diff --git a/sys/net/iflib.h b/sys/net/iflib.h index 3817445228d0..e65c936fc4b4 100644 --- a/sys/net/iflib.h +++ b/sys/net/iflib.h @@ -272,7 +272,7 @@ struct if_shared_ctx { int isc_ntxqs; /* # of tx queues per tx qset - usually 1 */ int isc_nrxqs; /* # of rx queues per rx qset - intel 1, chelsio 2, broadcom 3 */ int __spare0__; - int isc_tx_reclaim_thresh; + int __spare1__; int isc_flags; };
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?202510012257.591MvCth033595>