Date:      Tue, 24 Apr 2018 11:55:53 +0300
From:      Konstantin Belousov <kostikbel@gmail.com>
To:        current@freebsd.org
Cc:        net@freebsd.org, menny@mellanox.com
Subject:   mlx5(4) jumbo receive
Message-ID:  <20180424085553.GA6887@kib.kiev.ua>

Hello,
the patch below is of interest for people who use Mellanox ConnectX-4
and ConnectX-5 ethernet adapters and configure them for jumbo frames.
The patch should improve the system behavior in low-memory or
fragmented-memory situations.  See the commit message for a detailed
description.  The patch should also not cause a performance degradation
for the normal 1500-byte MTU, because it keeps the same receive
configuration as the current unpatched driver.

I am interested in hearing about regressions (or not) caused by the
change.
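
To make the numbers concrete, here is a small standalone illustration
(not part of the patch) of the arithmetic performed by the new
mlx5e_get_wqe_sz() for a 9000-byte MTU.  The cluster-size constants are
the usual FreeBSD values and the starting value only approximates
MLX5E_SW2MB_MTU(9000):

#include <stdio.h>

#define MCLBYTES        2048
#define MJUMPAGESIZE    4096
#define MJUM9BYTES      (9 * 1024)
#define MJUM16BYTES     (16 * 1024)
#define howmany(x, y)   (((x) + ((y) - 1)) / (y))
#define powerof2(x)     ((((x) - 1) & (x)) == 0)

int
main(void)
{
        unsigned r = 9022;      /* roughly MLX5E_SW2MB_MTU(9000) */
        unsigned n;

        /* Round up to a cluster-size class, as mlx5e_get_wqe_sz() does. */
        if (r > MJUM9BYTES)
                r = MJUM16BYTES;
        else if (r > MJUMPAGESIZE)
                r = MJUM9BYTES;
        else if (r > MCLBYTES)
                r = MJUMPAGESIZE;
        else
                r = MCLBYTES;

        /*
         * n + 1 must be a power of two so that the WQE stride,
         * 16 * (n + 1) bytes, is a power of two.
         */
        for (n = howmany(r, MCLBYTES); !powerof2(n + 1); n++)
                ;

        /* Prints: wqe_sz 9216, 7 data segments, stride 128 bytes */
        printf("wqe_sz %u, %u data segments, stride %u bytes\n",
            r, n, 16 * (n + 1));
        return (0);
}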

commit f25c0c624177a1ca06ca051b0adb842acb66ec11
Author: Konstantin Belousov <konstantinb@mellanox.com>
Date:   Wed Nov 22 20:14:53 2017 +0200

    mlx5en: Handle jumbo frames without requiring big clusters.
    
    The scatter list is formed of MCLBYTES-sized chunks, and packets
    larger than the default MTU are returned to the stack as an mbuf
    chain.
    
    This is an improvement over the original patch by hanss@mellanox.com;
    it solves some busdma issues and sizes the WQE scatter list to only
    the minimal sufficient number of elements.  In particular, for the
    default MTU of 1500 bytes the receive queue format is not changed
    compared to the unpatched driver, avoiding a reported performance
    regression.
    
    Issue: 1065432
    Issue: 1184316
    Change-Id: Ib63a5ef55abd6ef1ec9b296e2cef88e4604bbd29
    Signed-off-by: Konstantin Belousov <konstantinb@mellanox.com>
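
For illustration only (not part of the patch): with the chunked scatter
list described above, a 9000-byte MTU frame lands in five MCLBYTES
clusters.  The userland sketch below mimics the allocation done in
mlx5e_alloc_rx_wqe() and the trimming done in mlx5e_build_rx_mbuf() on
completion; the 9014-byte frame length and the constants are assumed
example values.

#include <stdio.h>

#define MCLBYTES                2048
#define MLX5E_NET_IP_ALIGN      2

int
main(void)
{
        unsigned wqe_sz = 9216;         /* MJUM9BYTES, from mlx5e_get_wqe_sz() */
        unsigned cqe_bcnt = 9014;       /* assumed received frame length */
        unsigned nmb, total, len, i;

        /* Allocation: the chain grows until it covers wqe_sz bytes. */
        for (nmb = 0, total = 0; total < wqe_sz; nmb++)
                total += MCLBYTES;
        printf("%u clusters allocated (%u bytes)\n", nmb, total);

        /*
         * Completion: trim the chain to the received length; the first
         * cluster lost MLX5E_NET_IP_ALIGN bytes to IP header alignment.
         * Prints 2046, 2048, 2048, 2048 and 824 bytes for this example.
         */
        for (i = 0; i < nmb && cqe_bcnt > 0; i++) {
                len = MCLBYTES - (i == 0 ? MLX5E_NET_IP_ALIGN : 0);
                if (len > cqe_bcnt)
                        len = cqe_bcnt;
                cqe_bcnt -= len;
                printf("  mbuf %u: %u bytes\n", i, len);
        }
        return (0);
}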

diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h
index b5000c32eaf..ff5ef446e34 100644
--- a/sys/dev/mlx5/mlx5_en/en.h
+++ b/sys/dev/mlx5/mlx5_en/en.h
@@ -80,8 +80,19 @@
 #define	MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE                0xa
 #define	MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE                0xe
 
-/* freeBSD HW LRO is limited by 16KB - the size of max mbuf */
+#define	MLX5E_MAX_RX_SEGS 7
+
+#ifndef MLX5E_MAX_RX_BYTES
+#define	MLX5E_MAX_RX_BYTES MCLBYTES
+#endif
+
+#if (MLX5E_MAX_RX_SEGS == 1)
+/* FreeBSD HW LRO is limited by 16KB - the size of max mbuf */
 #define	MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ                 MJUM16BYTES
+#else
+#define	MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ \
+    MIN(65535, MLX5E_MAX_RX_SEGS * MLX5E_MAX_RX_BYTES)
+#endif
 #define	MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC      0x10
 #define	MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE	0x3
 #define	MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS      0x20
@@ -530,6 +541,7 @@ struct mlx5e_rq {
 	struct mtx mtx;
 	bus_dma_tag_t dma_tag;
 	u32	wqe_sz;
+	u32	nsegs;
 	struct mlx5e_rq_mbuf *mbuf;
 	struct ifnet *ifp;
 	struct mlx5e_rq_stats stats;
@@ -795,9 +807,12 @@ struct mlx5e_tx_wqe {
 
 struct mlx5e_rx_wqe {
 	struct mlx5_wqe_srq_next_seg next;
-	struct mlx5_wqe_data_seg data;
+	struct mlx5_wqe_data_seg data[];
 };
 
+/* the size of the structure above must be power of two */
+CTASSERT(powerof2(sizeof(struct mlx5e_rx_wqe)));
+
 struct mlx5e_eeprom {
 	int	lock_bit;
 	int	i2c_addr;
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
index c2c4b0d7744..25544e561e3 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -33,9 +33,12 @@
 #ifndef ETH_DRIVER_VERSION
 #define	ETH_DRIVER_VERSION	"3.4.1"
 #endif
+
 char mlx5e_version[] = "Mellanox Ethernet driver"
     " (" ETH_DRIVER_VERSION ")";
 
+static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs);
+
 struct mlx5e_channel_param {
 	struct mlx5e_rq_param rq;
 	struct mlx5e_sq_param sq;
@@ -810,6 +813,11 @@ mlx5e_create_rq(struct mlx5e_channel *c,
 	int wq_sz;
 	int err;
 	int i;
+	u32 nsegs, wqe_sz;
+
+	err = mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
+	if (err != 0)
+		goto done;
 
 	/* Create DMA descriptor TAG */
 	if ((err = -bus_dma_tag_create(
@@ -819,9 +827,9 @@ mlx5e_create_rq(struct mlx5e_channel *c,
 	    BUS_SPACE_MAXADDR,		/* lowaddr */
 	    BUS_SPACE_MAXADDR,		/* highaddr */
 	    NULL, NULL,			/* filter, filterarg */
-	    MJUM16BYTES,		/* maxsize */
-	    1,				/* nsegments */
-	    MJUM16BYTES,		/* maxsegsize */
+	    nsegs * MLX5E_MAX_RX_BYTES,	/* maxsize */
+	    nsegs,			/* nsegments */
+	    nsegs * MLX5E_MAX_RX_BYTES,	/* maxsegsize */
 	    0,				/* flags */
 	    NULL, NULL,			/* lockfunc, lockfuncarg */
 	    &rq->dma_tag)))
@@ -834,23 +842,9 @@ mlx5e_create_rq(struct mlx5e_channel *c,
 
 	rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
 
-	if (priv->params.hw_lro_en) {
-		rq->wqe_sz = priv->params.lro_wqe_sz;
-	} else {
-		rq->wqe_sz = MLX5E_SW2MB_MTU(priv->ifp->if_mtu);
-	}
-	if (rq->wqe_sz > MJUM16BYTES) {
-		err = -ENOMEM;
+	err = mlx5e_get_wqe_sz(priv, &rq->wqe_sz, &rq->nsegs);
+	if (err != 0)
 		goto err_rq_wq_destroy;
-	} else if (rq->wqe_sz > MJUM9BYTES) {
-		rq->wqe_sz = MJUM16BYTES;
-	} else if (rq->wqe_sz > MJUMPAGESIZE) {
-		rq->wqe_sz = MJUM9BYTES;
-	} else if (rq->wqe_sz > MCLBYTES) {
-		rq->wqe_sz = MJUMPAGESIZE;
-	} else {
-		rq->wqe_sz = MCLBYTES;
-	}
 
 	wq_sz = mlx5_wq_ll_get_size(&rq->wq);
 
@@ -861,7 +855,11 @@ mlx5e_create_rq(struct mlx5e_channel *c,
 	rq->mbuf = malloc(wq_sz * sizeof(rq->mbuf[0]), M_MLX5EN, M_WAITOK | M_ZERO);
 	for (i = 0; i != wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
+#if (MLX5E_MAX_RX_SEGS == 1)
 		uint32_t byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN;
+#else
+		int j;
+#endif
 
 		err = -bus_dmamap_create(rq->dma_tag, 0, &rq->mbuf[i].dma_map);
 		if (err != 0) {
@@ -869,8 +867,15 @@ mlx5e_create_rq(struct mlx5e_channel *c,
 				bus_dmamap_destroy(rq->dma_tag, rq->mbuf[i].dma_map);
 			goto err_rq_mbuf_free;
 		}
-		wqe->data.lkey = c->mkey_be;
-		wqe->data.byte_count = cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
+
+		/* set value for constant fields */
+#if (MLX5E_MAX_RX_SEGS == 1)
+		wqe->data[0].lkey = c->mkey_be;
+		wqe->data[0].byte_count = cpu_to_be32(byte_count | MLX5_HW_START_PADDING);
+#else
+		for (j = 0; j < rq->nsegs; j++)
+			wqe->data[j].lkey = c->mkey_be;
+#endif
 	}
 
 	rq->ifp = c->ifp;
@@ -1809,16 +1814,51 @@ mlx5e_close_channel_wait(struct mlx5e_channel *volatile *pp)
 	free(c, M_MLX5EN);
 }
 
+static int
+mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs)
+{
+	u32 r, n;
+
+	r = priv->params.hw_lro_en ? priv->params.lro_wqe_sz :
+	    MLX5E_SW2MB_MTU(priv->ifp->if_mtu);
+	if (r > MJUM16BYTES)
+		return (-ENOMEM);
+
+	if (r > MJUM9BYTES)
+		r = MJUM16BYTES;
+	else if (r > MJUMPAGESIZE)
+		r = MJUM9BYTES;
+	else if (r > MCLBYTES)
+		r = MJUMPAGESIZE;
+	else
+		r = MCLBYTES;
+
+	/*
+	 * n + 1 must be a power of two, because stride size must be.
+	 * Stride size is 16 * (n + 1), as the first segment is
+	 * control.
+	 */
+	for (n = howmany(r, MLX5E_MAX_RX_BYTES); !powerof2(n + 1); n++)
+		;
+
+	*wqe_sz = r;
+	*nsegs = n;
+	return (0);
+}
+
 static void
 mlx5e_build_rq_param(struct mlx5e_priv *priv,
     struct mlx5e_rq_param *param)
 {
 	void *rqc = param->rqc;
 	void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
+	u32 wqe_sz, nsegs;
 
+	mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
 	MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
 	MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
-	MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe)));
+	MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) +
+	    nsegs * sizeof(struct mlx5_wqe_data_seg)));
 	MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size);
 	MLX5_SET(wq, wq, pd, priv->pdn);
 
diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
index fb14be43b32..5868b0a2f55 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
@@ -32,21 +32,47 @@ static inline int
 mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
     struct mlx5e_rx_wqe *wqe, u16 ix)
 {
-	bus_dma_segment_t segs[1];
+	bus_dma_segment_t segs[rq->nsegs];
 	struct mbuf *mb;
 	int nsegs;
 	int err;
-
+#if (MLX5E_MAX_RX_SEGS != 1)
+	struct mbuf *mb_head;
+	int i;
+#endif
 	if (rq->mbuf[ix].mbuf != NULL)
 		return (0);
 
+#if (MLX5E_MAX_RX_SEGS == 1)
 	mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rq->wqe_sz);
 	if (unlikely(!mb))
 		return (-ENOMEM);
 
-	/* set initial mbuf length */
 	mb->m_pkthdr.len = mb->m_len = rq->wqe_sz;
+#else
+	mb_head = mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
+	    MLX5E_MAX_RX_BYTES);
+	if (unlikely(mb == NULL))
+		return (-ENOMEM);
+
+	mb->m_len = MLX5E_MAX_RX_BYTES;
+	mb->m_pkthdr.len = MLX5E_MAX_RX_BYTES;
 
+	for (i = 1; i < rq->nsegs; i++) {
+		if (mb_head->m_pkthdr.len >= rq->wqe_sz)
+			break;
+		mb = mb->m_next = m_getjcl(M_NOWAIT, MT_DATA, 0,
+		    MLX5E_MAX_RX_BYTES);
+		if (unlikely(mb == NULL)) {
+			m_freem(mb_head);
+			return (-ENOMEM);
+		}
+		mb->m_len = MLX5E_MAX_RX_BYTES;
+		mb_head->m_pkthdr.len += MLX5E_MAX_RX_BYTES;
+	}
+	/* rewind to first mbuf in chain */
+	mb = mb_head;
+#endif
 	/* get IP header aligned */
 	m_adj(mb, MLX5E_NET_IP_ALIGN);
 
@@ -54,12 +80,26 @@ mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
 	    mb, segs, &nsegs, BUS_DMA_NOWAIT);
 	if (err != 0)
 		goto err_free_mbuf;
-	if (unlikely(nsegs != 1)) {
+	if (unlikely(nsegs == 0)) {
 		bus_dmamap_unload(rq->dma_tag, rq->mbuf[ix].dma_map);
 		err = -ENOMEM;
 		goto err_free_mbuf;
 	}
-	wqe->data.addr = cpu_to_be64(segs[0].ds_addr);
+#if (MLX5E_MAX_RX_SEGS == 1)
+	wqe->data[0].addr = cpu_to_be64(segs[0].ds_addr);
+#else
+	wqe->data[0].addr = cpu_to_be64(segs[0].ds_addr);
+	wqe->data[0].byte_count = cpu_to_be32(segs[0].ds_len |
+	    MLX5_HW_START_PADDING);
+	for (i = 1; i != nsegs; i++) {
+		wqe->data[i].addr = cpu_to_be64(segs[i].ds_addr);
+		wqe->data[i].byte_count = cpu_to_be32(segs[i].ds_len);
+	}
+	for (; i < rq->nsegs; i++) {
+		wqe->data[i].addr = 0;
+		wqe->data[i].byte_count = 0;
+	}
+#endif
 
 	rq->mbuf[ix].mbuf = mb;
 	rq->mbuf[ix].data = mb->m_data;
@@ -214,6 +254,9 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
 {
 	struct ifnet *ifp = rq->ifp;
 	struct mlx5e_channel *c;
+#if (MLX5E_MAX_RX_SEGS != 1)
+	struct mbuf *mb_head;
+#endif
 	int lro_num_seg;	/* HW LRO session aggregated packets counter */
 	uint64_t tstmp;
 
@@ -224,7 +267,26 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
 		rq->stats.lro_bytes += cqe_bcnt;
 	}
 
+#if (MLX5E_MAX_RX_SEGS == 1)
 	mb->m_pkthdr.len = mb->m_len = cqe_bcnt;
+#else
+	mb->m_pkthdr.len = cqe_bcnt;
+	for (mb_head = mb; mb != NULL; mb = mb->m_next) {
+		if (mb->m_len > cqe_bcnt)
+			mb->m_len = cqe_bcnt;
+		cqe_bcnt -= mb->m_len;
+		if (likely(cqe_bcnt == 0)) {
+			if (likely(mb->m_next != NULL)) {
+				/* trim off empty mbufs */
+				m_freem(mb->m_next);
+				mb->m_next = NULL;
+			}
+			break;
+		}
+	}
+	/* rewind to first mbuf in chain */
+	mb = mb_head;
+#endif
 	/* check if a Toeplitz hash was computed */
 	if (cqe->rss_hash_type != 0) {
 		mb->m_pkthdr.flowid = be32_to_cpu(cqe->rss_hash_result);
@@ -447,7 +509,10 @@ mlx5e_rx_cq_comp(struct mlx5_core_cq *mcq)
 	int i = 0;
 
 #ifdef HAVE_PER_CQ_EVENT_PACKET
-	struct mbuf *mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rq->wqe_sz);
+#if (MHLEN < 15)
+#error "MHLEN is too small"
+#endif
+	struct mbuf *mb = m_gethdr(M_NOWAIT, MT_DATA);
 
 	if (mb != NULL) {
 		/* this code is used for debugging purpose only */


