Date: Mon, 24 Feb 2025 14:04:24 GMT
From: Wei Hu <whu@FreeBSD.org>
To: src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject: git: a18e99945414 - main - mana: Increase default tx and rx ring size to 1024
Message-ID: <202502241404.51OE4Os9014808@gitrepo.freebsd.org>
The branch main has been updated by whu:

URL: https://cgit.FreeBSD.org/src/commit/?id=a18e99945414fb1f9d455b780c6fcf2d09cc68d8

commit a18e99945414fb1f9d455b780c6fcf2d09cc68d8
Author:     Wei Hu <whu@FreeBSD.org>
AuthorDate: 2025-02-24 13:56:06 +0000
Commit:     Wei Hu <whu@FreeBSD.org>
CommitDate: 2025-02-24 13:56:06 +0000

    mana: Increase default tx and rx ring size to 1024

    TCP performance tests show a high number of retries under heavy tx
    traffic. The numbers of queue stops and wakeups also increase.
    Further analysis suggests that the FreeBSD network stack tends to
    send TSO packets with multiple sg entries, typically ranging from
    10 to 16. On mana, every two sgs take one unit of the tx ring.
    Therefore, adding one unit for the head, it takes 6 to 9 units of
    the tx ring to send a typical TSO packet.

    The current default tx ring size is 256, which can fill up quickly
    under heavy load. When the tx ring is full, the send queue is
    stopped while waiting for ring space to be freed. This can cause
    the network stack to drop packets and lead to TCP retransmissions.

    Increase the default tx and rx ring size to 1024 units. Also
    introduce two tunables allowing users to request the tx and rx
    ring sizes in loader.conf:

        hw.mana.rx_req_size
        hw.mana.tx_req_size

    When mana loads, the driver checks these two values and rounds
    them up to a power of two. If they are not set, or the requested
    values are out of the allowable range, it uses the default ring
    size instead.

    Also change the tx and rx single-loop completion budget to 8.

    Tested by:      whu
    MFC after:      2 weeks
    Sponsored by:   Microsoft
---
 sys/dev/mana/mana.h        | 23 +++++++++--
 sys/dev/mana/mana_en.c     | 96 +++++++++++++++++++++++++++++++++++++++-------
 sys/dev/mana/mana_sysctl.c | 16 ++++++++
 3 files changed, 119 insertions(+), 16 deletions(-)
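As a back-of-the-envelope check of the sizing math in the commit message,
here is a standalone sketch (illustrative only, not driver code; the
constants come straight from the message above, where a packet costs one
head unit plus one ring unit per two sg entries):

    /* Hypothetical illustration: units = 1 + ceil(sgs / 2). */
    #include <stdio.h>

    int
    main(void)
    {
            for (int sgs = 10; sgs <= 16; sgs++) {
                    int units = 1 + (sgs + 1) / 2;

                    printf("%2d sgs -> %d tx ring units; "
                        "~%2d packets fit in a 256-unit ring, "
                        "~%3d in a 1024-unit ring\n",
                        sgs, units, 256 / units, 1024 / units);
            }
            return (0);
    }

So a typical TSO packet costs 6 to 9 ring units, and the old 256-unit
ring holds only about 28 to 42 such packets in flight, versus roughly
113 to 170 with the new 1024-unit default.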
diff --git a/sys/dev/mana/mana.h b/sys/dev/mana/mana.h
index 906b28eb56b6..a805aa047b9d 100644
--- a/sys/dev/mana/mana.h
+++ b/sys/dev/mana/mana.h
@@ -106,9 +106,23 @@ enum TRI_STATE {
 #define DEFAULT_FRAME_SIZE (ADAPTER_MTU_SIZE + 14)
 #define MAX_FRAME_SIZE 4096
 
-#define RX_BUFFERS_PER_QUEUE 512
-
-#define MAX_SEND_BUFFERS_PER_QUEUE 256
+/* Unit number of RX buffers. Must be power of two
+ * Higher number could fail at allocation.
+ */
+#define MAX_RX_BUFFERS_PER_QUEUE 8192
+#define DEF_RX_BUFFERS_PER_QUEUE 1024
+#define MIN_RX_BUFFERS_PER_QUEUE 128
+
+/* Unit number of TX buffers. Must be power of two
+ * Higher number could fail at allocation.
+ * The max value is derived as the maximum
+ * allocatable pages supported on host per guest
+ * through testing. TX buffer size beyond this
+ * value is rejected by the hardware.
+ */
+#define MAX_SEND_BUFFERS_PER_QUEUE 16384
+#define DEF_SEND_BUFFERS_PER_QUEUE 1024
+#define MIN_SEND_BUFFERS_PER_QUEUE 128
 
 #define EQ_SIZE (8 * PAGE_SIZE)
 #define LOG2_EQ_THROTTLE 3
@@ -507,6 +521,9 @@ struct mana_port_context {
 	unsigned int max_queues;
 	unsigned int num_queues;
 
+	unsigned int tx_queue_size;
+	unsigned int rx_queue_size;
+
 	mana_handle_t port_handle;
 	int vport_use_count;
 
diff --git a/sys/dev/mana/mana_en.c b/sys/dev/mana/mana_en.c
index 735b94bba6cd..a1d2d1015b89 100644
--- a/sys/dev/mana/mana_en.c
+++ b/sys/dev/mana/mana_en.c
@@ -67,6 +67,9 @@ static int mana_up(struct mana_port_context *apc);
 
 static int mana_down(struct mana_port_context *apc);
 
+extern unsigned int mana_tx_req_size;
+extern unsigned int mana_rx_req_size;
+
 static void
 mana_rss_key_fill(void *k, size_t size)
 {
@@ -492,6 +495,7 @@ mana_xmit(struct mana_txq *txq)
 	if_t ndev = txq->ndev;
 	struct mbuf *mbuf;
 	struct mana_port_context *apc = if_getsoftc(ndev);
+	unsigned int tx_queue_size = apc->tx_queue_size;
 	struct mana_port_stats *port_stats = &apc->port_stats;
 	struct gdma_dev *gd = apc->ac->gdma_dev;
 	uint64_t packets, bytes;
@@ -635,7 +639,7 @@ mana_xmit(struct mana_txq *txq)
 		}
 
 		next_to_use =
-		    (next_to_use + 1) % MAX_SEND_BUFFERS_PER_QUEUE;
+		    (next_to_use + 1) % tx_queue_size;
 
 		(void)atomic_inc_return(&txq->pending_sends);
 
@@ -1423,6 +1427,7 @@ mana_poll_tx_cq(struct mana_cq *cq)
 	unsigned int wqe_unit_cnt = 0;
 	struct mana_txq *txq = cq->txq;
 	struct mana_port_context *apc;
+	unsigned int tx_queue_size;
 	uint16_t next_to_complete;
 	if_t ndev;
 	int comp_read;
@@ -1436,6 +1441,7 @@ mana_poll_tx_cq(struct mana_cq *cq)
 
 	ndev = txq->ndev;
 	apc = if_getsoftc(ndev);
+	tx_queue_size = apc->tx_queue_size;
 
 	comp_read = mana_gd_poll_cq(cq->gdma_cq, completions,
 	    CQE_POLLING_BUFFER);
@@ -1521,7 +1527,7 @@ mana_poll_tx_cq(struct mana_cq *cq)
 		mb();
 
 		next_to_complete =
-		    (next_to_complete + 1) % MAX_SEND_BUFFERS_PER_QUEUE;
+		    (next_to_complete + 1) % tx_queue_size;
 
 		pkt_transmitted++;
 	}
@@ -1867,9 +1873,9 @@ mana_cq_handler(void *context, struct gdma_queue *gdma_queue)
 	mana_gd_ring_cq(gdma_queue, arm_bit);
 }
 
-#define MANA_POLL_BUDGET 8
-#define MANA_RX_BUDGET 256
-#define MANA_TX_BUDGET MAX_SEND_BUFFERS_PER_QUEUE
+#define MANA_POLL_BUDGET 256
+#define MANA_RX_BUDGET 8
+#define MANA_TX_BUDGET 8
 
 static void
 mana_poll(void *arg, int pending)
@@ -1976,7 +1982,7 @@ mana_deinit_txq(struct mana_port_context *apc, struct mana_txq *txq)
 
 	if (txq->tx_buf_info) {
 		/* Free all mbufs which are still in-flight */
-		for (i = 0; i < MAX_SEND_BUFFERS_PER_QUEUE; i++) {
+		for (i = 0; i < apc->tx_queue_size; i++) {
 			txbuf_info = &txq->tx_buf_info[i];
 			if (txbuf_info->mbuf) {
 				mana_tx_unmap_mbuf(apc, txbuf_info);
@@ -2034,15 +2040,19 @@ mana_create_txq(struct mana_port_context *apc, if_t net)
 	    M_DEVBUF, M_WAITOK | M_ZERO);
 
 	/* The minimum size of the WQE is 32 bytes, hence
-	 * MAX_SEND_BUFFERS_PER_QUEUE represents the maximum number of WQEs
+	 * apc->tx_queue_size represents the maximum number of WQEs
 	 * the SQ can store. This value is then used to size other queues
 	 * to prevent overflow.
+	 * Also note that the txq_size is always going to be page aligned,
+	 * as min val of apc->tx_queue_size is 128 and that would make
+	 * txq_size 128 * 32 = 4096 and the other higher values of
+	 * apc->tx_queue_size are always power of two.
 	 */
-	txq_size = MAX_SEND_BUFFERS_PER_QUEUE * 32;
+	txq_size = apc->tx_queue_size * 32;
 	KASSERT(IS_ALIGNED(txq_size, PAGE_SIZE),
 	    ("txq size not page aligned"));
 
-	cq_size = MAX_SEND_BUFFERS_PER_QUEUE * COMP_ENTRY_SIZE;
+	cq_size = apc->tx_queue_size * COMP_ENTRY_SIZE;
 	cq_size = ALIGN(cq_size, PAGE_SIZE);
 
 	gc = gd->gdma_context;
@@ -2125,7 +2135,7 @@ mana_create_txq(struct mana_port_context *apc, if_t net)
 		gc->cq_table[cq->gdma_id] = cq->gdma_cq;
 
 		/* Initialize tx specific data */
-		txq->tx_buf_info = malloc(MAX_SEND_BUFFERS_PER_QUEUE *
+		txq->tx_buf_info = malloc(apc->tx_queue_size *
 		    sizeof(struct mana_send_buf_info),
 		    M_DEVBUF, M_WAITOK | M_ZERO);
 
@@ -2133,7 +2143,7 @@
 		    "mana:tx(%d)", i);
 		mtx_init(&txq->txq_mtx, txq->txq_mtx_name, NULL, MTX_DEF);
 
-		txq->txq_br = buf_ring_alloc(4 * MAX_SEND_BUFFERS_PER_QUEUE,
+		txq->txq_br = buf_ring_alloc(4 * apc->tx_queue_size,
 		    M_DEVBUF, M_WAITOK, &txq->txq_mtx);
 
 		/* Allocate taskqueue for deferred send */
@@ -2323,10 +2333,10 @@ mana_create_rxq(struct mana_port_context *apc, uint32_t rxq_idx,
 	gc = gd->gdma_context;
 
 	rxq = malloc(sizeof(*rxq) +
-	    RX_BUFFERS_PER_QUEUE * sizeof(struct mana_recv_buf_oob),
+	    apc->rx_queue_size * sizeof(struct mana_recv_buf_oob),
 	    M_DEVBUF, M_WAITOK | M_ZERO);
 	rxq->ndev = ndev;
-	rxq->num_rx_buf = RX_BUFFERS_PER_QUEUE;
+	rxq->num_rx_buf = apc->rx_queue_size;
 	rxq->rxq_idx = rxq_idx;
 	/*
 	 * Minimum size is MCLBYTES(2048) bytes for a mbuf cluster.
@@ -2763,6 +2773,62 @@ mana_detach(if_t ndev)
 	return err;
 }
 
+static unsigned int
+mana_get_tx_queue_size(int port_idx, unsigned int request_size)
+{
+	unsigned int new_size;
+
+	if (request_size == 0)
+		/* Uninitialized */
+		new_size = DEF_SEND_BUFFERS_PER_QUEUE;
+	else
+		new_size = roundup_pow_of_two(request_size);
+
+	if (new_size < MIN_SEND_BUFFERS_PER_QUEUE ||
+	    new_size > MAX_SEND_BUFFERS_PER_QUEUE) {
+		mana_info(NULL, "mana port %d: requested tx buffer "
+		    "size %u out of allowable range (%u - %u), "
+		    "setting to default\n",
+		    port_idx, request_size,
+		    MIN_SEND_BUFFERS_PER_QUEUE,
+		    MAX_SEND_BUFFERS_PER_QUEUE);
+		new_size = DEF_SEND_BUFFERS_PER_QUEUE;
+	}
+	mana_info(NULL, "mana port %d: tx buffer size %u "
+	    "(%u requested)\n",
+	    port_idx, new_size, request_size);
+
+	return (new_size);
+}
+
+static unsigned int
+mana_get_rx_queue_size(int port_idx, unsigned int request_size)
+{
+	unsigned int new_size;
+
+	if (request_size == 0)
+		/* Uninitialized */
+		new_size = DEF_RX_BUFFERS_PER_QUEUE;
+	else
+		new_size = roundup_pow_of_two(request_size);
+
+	if (new_size < MIN_RX_BUFFERS_PER_QUEUE ||
+	    new_size > MAX_RX_BUFFERS_PER_QUEUE) {
+		mana_info(NULL, "mana port %d: requested rx buffer "
+		    "size %u out of allowable range (%u - %u), "
+		    "setting to default\n",
+		    port_idx, request_size,
+		    MIN_RX_BUFFERS_PER_QUEUE,
+		    MAX_RX_BUFFERS_PER_QUEUE);
+		new_size = DEF_RX_BUFFERS_PER_QUEUE;
+	}
+	mana_info(NULL, "mana port %d: rx buffer size %u "
+	    "(%u requested)\n",
+	    port_idx, new_size, request_size);
+
+	return (new_size);
+}
+
 static int
 mana_probe_port(struct mana_context *ac, int port_idx,
     if_t *ndev_storage)
@@ -2782,6 +2848,10 @@ mana_probe_port(struct mana_context *ac, int port_idx,
 	apc->max_queues = gc->max_num_queues;
 	apc->num_queues = min_t(unsigned int,
 	    gc->max_num_queues, MANA_MAX_NUM_QUEUES);
+	apc->tx_queue_size = mana_get_tx_queue_size(port_idx,
+	    mana_tx_req_size);
+	apc->rx_queue_size = mana_get_rx_queue_size(port_idx,
+	    mana_rx_req_size);
 	apc->port_handle = INVALID_MANA_HANDLE;
 	apc->port_idx = port_idx;
 	apc->frame_size = DEFAULT_FRAME_SIZE;
diff --git a/sys/dev/mana/mana_sysctl.c b/sys/dev/mana/mana_sysctl.c
index 844a05040595..acb3628f09bc 100644
--- a/sys/dev/mana/mana_sysctl.c
+++ b/sys/dev/mana/mana_sysctl.c
@@ -34,9 +34,17 @@ static int mana_sysctl_cleanup_thread_cpu(SYSCTL_HANDLER_ARGS);
 
 int mana_log_level = MANA_ALERT | MANA_WARNING | MANA_INFO;
 
+unsigned int mana_tx_req_size;
+unsigned int mana_rx_req_size;
+
 SYSCTL_NODE(_hw, OID_AUTO, mana, CTLFLAG_RD | CTLFLAG_MPSAFE,
     0, "MANA driver parameters");
 
+SYSCTL_UINT(_hw_mana, OID_AUTO, tx_req_size, CTLFLAG_RWTUN,
+    &mana_tx_req_size, 0, "requested number of unit of tx queue");
+SYSCTL_UINT(_hw_mana, OID_AUTO, rx_req_size, CTLFLAG_RWTUN,
+    &mana_rx_req_size, 0, "requested number of unit of rx queue");
+
 /*
  * Logging level for changing verbosity of the output
  */
@@ -166,6 +174,14 @@ mana_sysctl_add_port(struct mana_port_context *apc)
 	    "enable_altq", CTLFLAG_RW, &apc->enable_tx_altq, 0,
 	    "Choose alternative txq under heavy load");
 
+	SYSCTL_ADD_UINT(ctx, apc->port_list, OID_AUTO,
+	    "tx_queue_size", CTLFLAG_RD, &apc->tx_queue_size, 0,
+	    "number of unit of tx queue");
+
+	SYSCTL_ADD_UINT(ctx, apc->port_list, OID_AUTO,
+	    "rx_queue_size", CTLFLAG_RD, &apc->rx_queue_size, 0,
+	    "number of unit of rx queue");
+
 	SYSCTL_ADD_PROC(ctx, apc->port_list, OID_AUTO,
 	    "bind_cleanup_thread_cpu",
 	    CTLTYPE_U8 | CTLFLAG_RW | CTLFLAG_MPSAFE,
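To experiment with the new knobs, a loader.conf(5) fragment along the
following lines should work (the value 2048 is purely illustrative; per
mana_get_tx_queue_size() and mana_get_rx_queue_size() above, a request
is rounded up to the next power of two, so e.g. 1500 also yields 2048,
and it must fall within 128 - 16384 for tx and 128 - 8192 for rx,
otherwise the 1024 default is used):

    # /boot/loader.conf
    hw.mana.tx_req_size="2048"
    hw.mana.rx_req_size="2048"

The sizes actually chosen for each port are logged via mana_info() at
attach time and can also be read back through the read-only
tx_queue_size and rx_queue_size per-port sysctls added in
mana_sysctl.c.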