Date:      Mon, 24 Feb 2025 14:04:24 GMT
From:      Wei Hu <whu@FreeBSD.org>
To:        src-committers@FreeBSD.org, dev-commits-src-all@FreeBSD.org, dev-commits-src-main@FreeBSD.org
Subject:   git: a18e99945414 - main - mana: Increase default tx and rx ring size to 1024
Message-ID:  <202502241404.51OE4Os9014808@gitrepo.freebsd.org>

The branch main has been updated by whu:

URL: https://cgit.FreeBSD.org/src/commit/?id=a18e99945414fb1f9d455b780c6fcf2d09cc68d8

commit a18e99945414fb1f9d455b780c6fcf2d09cc68d8
Author:     Wei Hu <whu@FreeBSD.org>
AuthorDate: 2025-02-24 13:56:06 +0000
Commit:     Wei Hu <whu@FreeBSD.org>
CommitDate: 2025-02-24 13:56:06 +0000

    mana: Increase default tx and rx ring size to 1024
    
    TCP performance tests show a high number of retries under heavy
    tx traffic. The number of queue stops and wakeups also increases.
    Further analysis suggests the FreeBSD network stack tends to send
    TSO packets with multiple sg entries, typically ranging from
    10 to 16. On mana, every two sgs take one unit of the tx ring.
    Therefore, adding one unit for the head, it takes 6 to 9 units
    of the tx ring to send a typical TSO packet.
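    
    A rough sketch of that arithmetic (the names nsegs and units are
    illustrative, not the driver's; howmany() is the round-up
    division macro from sys/param.h):
    
            /* one unit for the head, plus one unit per two sgs */
            units = 1 + howmany(nsegs, 2);
            /* nsegs = 10 -> 1 + 5 = 6 units
               nsegs = 16 -> 1 + 8 = 9 units */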
    
    The current default tx ring size is 256, which can fill up
    quickly under heavy load. When the tx ring is full, the send
    queue is stopped to wait for ring space to be freed. This can
    cause the network stack to drop packets and lead to TCP
    retransmissions.
    
    Increase the default tx and rx ring size to 1024 units. Also
    introduce two tunables allowing users to request the tx and rx
    ring sizes in loader.conf:
            hw.mana.rx_req_size
            hw.mana.tx_req_size
    When mana is loading, the driver checks these two values and
    rounds them up to the next power of 2. If they are not set, or
    the requested values are out of the allowable range, it falls
    back to the default ring size instead.
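    
    For example, to request 2048-unit rings in loader.conf (the
    values are illustrative; a non-power-of-2 request such as 1500
    would be rounded up to 2048 before the range check):
    
            hw.mana.tx_req_size="2048"
            hw.mana.rx_req_size="2048"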
    
    Also change the tx and rx single-loop completion budgets to 8,
    and raise the overall poll budget from 8 to 256.
    
    Tested by:      whu
    MFC after:      2 weeks
    Sponsored by:   Microsoft
---
 sys/dev/mana/mana.h        | 23 +++++++++--
 sys/dev/mana/mana_en.c     | 96 +++++++++++++++++++++++++++++++++++++++-------
 sys/dev/mana/mana_sysctl.c | 16 ++++++++
 3 files changed, 119 insertions(+), 16 deletions(-)

diff --git a/sys/dev/mana/mana.h b/sys/dev/mana/mana.h
index 906b28eb56b6..a805aa047b9d 100644
--- a/sys/dev/mana/mana.h
+++ b/sys/dev/mana/mana.h
@@ -106,9 +106,23 @@ enum TRI_STATE {
 #define DEFAULT_FRAME_SIZE		(ADAPTER_MTU_SIZE + 14)
 #define MAX_FRAME_SIZE			4096
 
-#define RX_BUFFERS_PER_QUEUE		512
-
-#define MAX_SEND_BUFFERS_PER_QUEUE	256
+/* Number of RX buffer units. Must be a power of two.
+ * A higher number could fail at allocation.
+ */
+#define MAX_RX_BUFFERS_PER_QUEUE	8192
+#define DEF_RX_BUFFERS_PER_QUEUE	1024
+#define MIN_RX_BUFFERS_PER_QUEUE	128
+
+/* Number of TX buffer units. Must be a power of two.
+ * A higher number could fail at allocation.
+ * The max value was derived through testing as the
+ * maximum number of allocatable pages supported on
+ * the host per guest. TX buffer sizes beyond this
+ * value are rejected by the hardware.
+ */
+#define MAX_SEND_BUFFERS_PER_QUEUE	16384
+#define DEF_SEND_BUFFERS_PER_QUEUE	1024
+#define MIN_SEND_BUFFERS_PER_QUEUE	128
 
 #define EQ_SIZE				(8 * PAGE_SIZE)
 #define LOG2_EQ_THROTTLE		3
@@ -507,6 +521,9 @@ struct mana_port_context {
 	unsigned int		max_queues;
 	unsigned int		num_queues;
 
+	unsigned int		tx_queue_size;
+	unsigned int		rx_queue_size;
+
 	mana_handle_t		port_handle;
 
 	int			vport_use_count;
diff --git a/sys/dev/mana/mana_en.c b/sys/dev/mana/mana_en.c
index 735b94bba6cd..a1d2d1015b89 100644
--- a/sys/dev/mana/mana_en.c
+++ b/sys/dev/mana/mana_en.c
@@ -67,6 +67,9 @@
 static int mana_up(struct mana_port_context *apc);
 static int mana_down(struct mana_port_context *apc);
 
+extern unsigned int mana_tx_req_size;
+extern unsigned int mana_rx_req_size;
+
 static void
 mana_rss_key_fill(void *k, size_t size)
 {
@@ -492,6 +495,7 @@ mana_xmit(struct mana_txq *txq)
 	if_t ndev = txq->ndev;
 	struct mbuf *mbuf;
 	struct mana_port_context *apc = if_getsoftc(ndev);
+	unsigned int tx_queue_size = apc->tx_queue_size;
 	struct mana_port_stats *port_stats = &apc->port_stats;
 	struct gdma_dev *gd = apc->ac->gdma_dev;
 	uint64_t packets, bytes;
@@ -635,7 +639,7 @@ mana_xmit(struct mana_txq *txq)
 		}
 
 		next_to_use =
-		    (next_to_use + 1) % MAX_SEND_BUFFERS_PER_QUEUE;
+		    (next_to_use + 1) % tx_queue_size;
 
 		(void)atomic_inc_return(&txq->pending_sends);
 
@@ -1423,6 +1427,7 @@ mana_poll_tx_cq(struct mana_cq *cq)
 	unsigned int wqe_unit_cnt = 0;
 	struct mana_txq *txq = cq->txq;
 	struct mana_port_context *apc;
+	unsigned int tx_queue_size;
 	uint16_t next_to_complete;
 	if_t ndev;
 	int comp_read;
@@ -1436,6 +1441,7 @@ mana_poll_tx_cq(struct mana_cq *cq)
 
 	ndev = txq->ndev;
 	apc = if_getsoftc(ndev);
+	tx_queue_size = apc->tx_queue_size;
 
 	comp_read = mana_gd_poll_cq(cq->gdma_cq, completions,
 	    CQE_POLLING_BUFFER);
@@ -1521,7 +1527,7 @@ mana_poll_tx_cq(struct mana_cq *cq)
 		mb();
 
 		next_to_complete =
-		    (next_to_complete + 1) % MAX_SEND_BUFFERS_PER_QUEUE;
+		    (next_to_complete + 1) % tx_queue_size;
 
 		pkt_transmitted++;
 	}
@@ -1867,9 +1873,9 @@ mana_cq_handler(void *context, struct gdma_queue *gdma_queue)
 	mana_gd_ring_cq(gdma_queue, arm_bit);
 }
 
-#define MANA_POLL_BUDGET	8
-#define MANA_RX_BUDGET		256
-#define MANA_TX_BUDGET		MAX_SEND_BUFFERS_PER_QUEUE
+#define MANA_POLL_BUDGET	256
+#define MANA_RX_BUDGET		8
+#define MANA_TX_BUDGET		8
 
 static void
 mana_poll(void *arg, int pending)
@@ -1976,7 +1982,7 @@ mana_deinit_txq(struct mana_port_context *apc, struct mana_txq *txq)
 
 	if (txq->tx_buf_info) {
 		/* Free all mbufs which are still in-flight */
-		for (i = 0; i < MAX_SEND_BUFFERS_PER_QUEUE; i++) {
+		for (i = 0; i < apc->tx_queue_size; i++) {
 			txbuf_info = &txq->tx_buf_info[i];
 			if (txbuf_info->mbuf) {
 				mana_tx_unmap_mbuf(apc, txbuf_info);
@@ -2034,15 +2040,19 @@ mana_create_txq(struct mana_port_context *apc, if_t net)
 	    M_DEVBUF, M_WAITOK | M_ZERO);
 
 	/*  The minimum size of the WQE is 32 bytes, hence
-	 *  MAX_SEND_BUFFERS_PER_QUEUE represents the maximum number of WQEs
+	 *  apc->tx_queue_size represents the maximum number of WQEs
 	 *  the SQ can store. This value is then used to size other queues
 	 *  to prevent overflow.
+	 *  Also note that txq_size is always page aligned: the minimum
+	 *  value of apc->tx_queue_size is 128, which makes txq_size
+	 *  128 * 32 = 4096, and all higher values of apc->tx_queue_size
+	 *  are powers of two.
 	 */
-	txq_size = MAX_SEND_BUFFERS_PER_QUEUE * 32;
+	txq_size = apc->tx_queue_size * 32;
 	KASSERT(IS_ALIGNED(txq_size, PAGE_SIZE),
 	    ("txq size not page aligned"));
 
-	cq_size = MAX_SEND_BUFFERS_PER_QUEUE * COMP_ENTRY_SIZE;
+	cq_size = apc->tx_queue_size * COMP_ENTRY_SIZE;
 	cq_size = ALIGN(cq_size, PAGE_SIZE);
 
 	gc = gd->gdma_context;
@@ -2125,7 +2135,7 @@ mana_create_txq(struct mana_port_context *apc, if_t net)
 		gc->cq_table[cq->gdma_id] = cq->gdma_cq;
 
 		/* Initialize tx specific data */
-		txq->tx_buf_info = malloc(MAX_SEND_BUFFERS_PER_QUEUE *
+		txq->tx_buf_info = malloc(apc->tx_queue_size *
 		    sizeof(struct mana_send_buf_info),
 		    M_DEVBUF, M_WAITOK | M_ZERO);
 
@@ -2133,7 +2143,7 @@ mana_create_txq(struct mana_port_context *apc, if_t net)
 		    "mana:tx(%d)", i);
 		mtx_init(&txq->txq_mtx, txq->txq_mtx_name, NULL, MTX_DEF);
 
-		txq->txq_br = buf_ring_alloc(4 * MAX_SEND_BUFFERS_PER_QUEUE,
+		txq->txq_br = buf_ring_alloc(4 * apc->tx_queue_size,
 		    M_DEVBUF, M_WAITOK, &txq->txq_mtx);
 
 		/* Allocate taskqueue for deferred send */
@@ -2323,10 +2333,10 @@ mana_create_rxq(struct mana_port_context *apc, uint32_t rxq_idx,
 	gc = gd->gdma_context;
 
 	rxq = malloc(sizeof(*rxq) +
-	    RX_BUFFERS_PER_QUEUE * sizeof(struct mana_recv_buf_oob),
+	    apc->rx_queue_size * sizeof(struct mana_recv_buf_oob),
 	    M_DEVBUF, M_WAITOK | M_ZERO);
 	rxq->ndev = ndev;
-	rxq->num_rx_buf = RX_BUFFERS_PER_QUEUE;
+	rxq->num_rx_buf = apc->rx_queue_size;
 	rxq->rxq_idx = rxq_idx;
 	/*
 	 * Minimum size is MCLBYTES(2048) bytes for a mbuf cluster.
@@ -2763,6 +2773,62 @@ mana_detach(if_t ndev)
 	return err;
 }
 
+static unsigned int
+mana_get_tx_queue_size(int port_idx, unsigned int request_size)
+{
+	unsigned int new_size;
+
+	if (request_size == 0)
+		/* Uninitialized */
+		new_size = DEF_SEND_BUFFERS_PER_QUEUE;
+	else
+		new_size = roundup_pow_of_two(request_size);
+
+	if (new_size < MIN_SEND_BUFFERS_PER_QUEUE ||
+	    new_size > MAX_SEND_BUFFERS_PER_QUEUE) {
+		mana_info(NULL, "mana port %d: requested tx buffer "
+		    "size %u out of allowable range (%u - %u), "
+		    "setting to default\n",
+		    port_idx, request_size,
+		    MIN_SEND_BUFFERS_PER_QUEUE,
+		    MAX_SEND_BUFFERS_PER_QUEUE);
+		new_size = DEF_SEND_BUFFERS_PER_QUEUE;
+	}
+	mana_info(NULL, "mana port %d: tx buffer size %u "
+	    "(%u requested)\n",
+	    port_idx, new_size, request_size);
+
+	return (new_size);
+}
+
+static unsigned int
+mana_get_rx_queue_size(int port_idx, unsigned int request_size)
+{
+	unsigned int new_size;
+
+	if (request_size == 0)
+		/* Uninitialized */
+		new_size = DEF_RX_BUFFERS_PER_QUEUE;
+	else
+		new_size = roundup_pow_of_two(request_size);
+
+	if (new_size < MIN_RX_BUFFERS_PER_QUEUE ||
+	    new_size > MAX_RX_BUFFERS_PER_QUEUE) {
+		mana_info(NULL, "mana port %d: requested rx buffer "
+		    "size %u out of allowable range (%u - %u), "
+		    "setting to default\n",
+		    port_idx, request_size,
+		    MIN_RX_BUFFERS_PER_QUEUE,
+		    MAX_RX_BUFFERS_PER_QUEUE);
+		new_size = DEF_RX_BUFFERS_PER_QUEUE;
+	}
+	mana_info(NULL, "mana port %d: rx buffer size %u "
+	    "(%u requested)\n",
+	    port_idx, new_size, request_size);
+
+	return (new_size);
+}
+
 static int
 mana_probe_port(struct mana_context *ac, int port_idx,
     if_t *ndev_storage)
@@ -2782,6 +2848,10 @@ mana_probe_port(struct mana_context *ac, int port_idx,
 	apc->max_queues = gc->max_num_queues;
 	apc->num_queues = min_t(unsigned int,
 	    gc->max_num_queues, MANA_MAX_NUM_QUEUES);
+	apc->tx_queue_size = mana_get_tx_queue_size(port_idx,
+	    mana_tx_req_size);
+	apc->rx_queue_size = mana_get_rx_queue_size(port_idx,
+	    mana_rx_req_size);
 	apc->port_handle = INVALID_MANA_HANDLE;
 	apc->port_idx = port_idx;
 	apc->frame_size = DEFAULT_FRAME_SIZE;
diff --git a/sys/dev/mana/mana_sysctl.c b/sys/dev/mana/mana_sysctl.c
index 844a05040595..acb3628f09bc 100644
--- a/sys/dev/mana/mana_sysctl.c
+++ b/sys/dev/mana/mana_sysctl.c
@@ -34,9 +34,17 @@ static int mana_sysctl_cleanup_thread_cpu(SYSCTL_HANDLER_ARGS);
 
 int mana_log_level = MANA_ALERT | MANA_WARNING | MANA_INFO;
 
+unsigned int mana_tx_req_size;
+unsigned int mana_rx_req_size;
+
 SYSCTL_NODE(_hw, OID_AUTO, mana, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "MANA driver parameters");
 
+SYSCTL_UINT(_hw_mana, OID_AUTO, tx_req_size, CTLFLAG_RWTUN,
+    &mana_tx_req_size, 0, "requested number of tx queue units");
+SYSCTL_UINT(_hw_mana, OID_AUTO, rx_req_size, CTLFLAG_RWTUN,
+    &mana_rx_req_size, 0, "requested number of rx queue units");
+
 /*
  * Logging level for changing verbosity of the output
  */
@@ -166,6 +174,14 @@ mana_sysctl_add_port(struct mana_port_context *apc)
 	    "enable_altq", CTLFLAG_RW, &apc->enable_tx_altq, 0,
 	    "Choose alternative txq under heavy load");
 
+	SYSCTL_ADD_UINT(ctx, apc->port_list, OID_AUTO,
+	    "tx_queue_size", CTLFLAG_RD, &apc->tx_queue_size, 0,
+	    "number of tx queue units");
+
+	SYSCTL_ADD_UINT(ctx, apc->port_list, OID_AUTO,
+	    "rx_queue_size", CTLFLAG_RD, &apc->rx_queue_size, 0,
+	    "number of rx queue units");
+
 	SYSCTL_ADD_PROC(ctx, apc->port_list, OID_AUTO,
 	    "bind_cleanup_thread_cpu",
 	    CTLTYPE_U8 | CTLFLAG_RW | CTLFLAG_MPSAFE,
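
The effect of the round-up and range check can be verified from
userland by comparing the request tunables with the read-only
per-port sizes. A minimal sketch, assuming the port's sysctl list is
rooted at dev.mana.0 (the actual path depends on where apc->port_list
is attached):

    # requested sizes (read/write tunables, also settable in loader.conf)
    sysctl hw.mana.tx_req_size hw.mana.rx_req_size
    # effective sizes chosen by the driver (read-only)
    sysctl dev.mana.0.tx_queue_size dev.mana.0.rx_queue_size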


