Skip site navigation (1) | Skip section navigation (2)
Date:      Fri, 20 May 2016 06:54:58 +0000 (UTC)
From:      Hans Petter Selasky <hselasky@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r300277 - head/sys/dev/mlx5/mlx5_en
Message-ID:  <201605200654.u4K6swd0090981@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: hselasky
Date: Fri May 20 06:54:58 2016
New Revision: 300277
URL: https://svnweb.freebsd.org/changeset/base/300277

Log:
  Implement TX completion event interleaving.
  
  This patch implements a sysctl which allows setting a factor, N, for
  how many work queue elements can be generated before requiring a
  completion event. When a completion event happens, the code simulates
  N completion events instead of only one. When draining a transmit
  queue, at most N-1 NOPs are transmitted to force generation of the
  final completion event. Furthermore, a timer runs every hz ticks
  (i.e. once per second) to flush any remaining data off the transmit
  queue when tx_completion_fact is greater than 1.
  
  The goal of this feature is to reduce the PCI bandwidth needed when
  transmitting data.
  
  Sponsored by:	Mellanox Technologies
  Tested by:	Netflix
  MFC after:	1 week

Modified:
  head/sys/dev/mlx5/mlx5_en/en.h
  head/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c
  head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
  head/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c

Modified: head/sys/dev/mlx5/mlx5_en/en.h
==============================================================================
--- head/sys/dev/mlx5/mlx5_en/en.h	Fri May 20 06:47:42 2016	(r300276)
+++ head/sys/dev/mlx5/mlx5_en/en.h	Fri May 20 06:54:58 2016	(r300277)
@@ -391,6 +391,8 @@ struct mlx5e_params {
   m(+1, u64 tx_coalesce_usecs, "tx_coalesce_usecs", "Limit in usec for joining tx packets") \
   m(+1, u64 tx_coalesce_pkts, "tx_coalesce_pkts", "Maximum number of tx packets to join") \
   m(+1, u64 tx_coalesce_mode, "tx_coalesce_mode", "0: EQE mode 1: CQE mode") \
+  m(+1, u64 tx_completion_fact, "tx_completion_fact", "1..MAX: Completion event ratio") \
+  m(+1, u64 tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio") \
   m(+1, u64 hw_lro, "hw_lro", "set to enable hw_lro") \
   m(+1, u64 cqe_zipping, "cqe_zipping", "0 : CQE zipping disabled")
 
@@ -496,6 +498,13 @@ struct mlx5e_sq {
 	/* dirtied @xmit */
 	u16	pc __aligned(MLX5E_CACHELINE_SIZE);
 	u16	bf_offset;
+	u16	cev_counter;		/* completion event counter */
+	u16	cev_factor;		/* completion event factor */
+	u32	cev_next_state;		/* next completion event state */
+#define	MLX5E_CEV_STATE_INITIAL 0	/* timer not started */
+#define	MLX5E_CEV_STATE_SEND_NOPS 1	/* send NOPs */
+#define	MLX5E_CEV_STATE_HOLD_NOPS 2	/* don't send NOPs yet */
+	struct callout cev_callout;
 	struct	mlx5e_sq_stats stats;
 
 	struct	mlx5e_cq cq;
@@ -787,6 +796,7 @@ void	mlx5e_create_stats(struct sysctl_ct
     struct sysctl_oid_list *, const char *,
     const char **, unsigned, u64 *);
 void	mlx5e_send_nop(struct mlx5e_sq *, u32, bool);
+void	mlx5e_sq_cev_timeout(void *);
 int	mlx5e_refresh_channel_params(struct mlx5e_priv *);
 
 #endif					/* _MLX5_EN_H_ */

Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c	Fri May 20 06:47:42 2016	(r300276)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_ethtool.c	Fri May 20 06:54:58 2016	(r300277)
@@ -48,6 +48,42 @@ mlx5e_create_stats(struct sysctl_ctx_lis
 	}
 }
 
+static void
+mlx5e_ethtool_sync_tx_completion_fact(struct mlx5e_priv *priv)
+{
+	/*
+	 * Limit the maximum distance between completion events to
+	 * half of the currently set TX queue size.
+	 *
+	 * The maximum number of queue entries a single IP packet can
+	 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
+	 *
+	 * The worst case max value is then given as below:
+	 */
+	uint64_t max = priv->params_ethtool.tx_queue_size /
+	    (2 * MLX5_SEND_WQE_MAX_WQEBBS);
+
+	/*
+	 * Update the maximum completion factor value in case the
+	 * tx_queue_size field changed. Ensure we don't overflow
+	 * 16-bits.
+	 */
+	if (max < 1)
+		max = 1;
+	else if (max > 65535)
+		max = 65535;
+	priv->params_ethtool.tx_completion_fact_max = max;
+
+	/*
+	 * Verify that the current TX completion factor is within the
+	 * given limits:
+	 */
+	if (priv->params_ethtool.tx_completion_fact < 1)
+		priv->params_ethtool.tx_completion_fact = 1;
+	else if (priv->params_ethtool.tx_completion_fact > max)
+		priv->params_ethtool.tx_completion_fact = max;
+}
+
 static int
 mlx5e_ethtool_handler(SYSCTL_HANDLER_ARGS)
 {
@@ -206,6 +242,14 @@ mlx5e_ethtool_handler(SYSCTL_HANDLER_ARG
 			priv->params_ethtool.cqe_zipping = 0;
 		}
 	}
+
+	if (&priv->params_ethtool.arg[arg2] ==
+	    &priv->params_ethtool.tx_completion_fact ||
+	    &priv->params_ethtool.arg[arg2] ==
+	    &priv->params_ethtool.tx_queue_size) {
+		/* verify parameter */
+		mlx5e_ethtool_sync_tx_completion_fact(priv);
+	}
 	if (was_opened)
 		mlx5e_open_locked(priv->ifp);
 done:
@@ -475,6 +519,7 @@ mlx5e_create_ethtool(struct mlx5e_priv *
 	priv->params_ethtool.tx_coalesce_pkts = priv->params.tx_cq_moderation_pkts;
 	priv->params_ethtool.hw_lro = priv->params.hw_lro_en;
 	priv->params_ethtool.cqe_zipping = priv->params.cqe_zipping_en;
+	mlx5e_ethtool_sync_tx_completion_fact(priv);
 
 	/* create root node */
 	node = SYSCTL_ADD_NODE(&priv->sysctl_ctx,

Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c	Fri May 20 06:47:42 2016	(r300276)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c	Fri May 20 06:54:58 2016	(r300277)
@@ -1185,24 +1185,82 @@ err_destroy_sq:
 }
 
 static void
-mlx5e_close_sq(struct mlx5e_sq *sq)
+mlx5e_sq_send_nops_locked(struct mlx5e_sq *sq, int can_sleep)
 {
-
-	/* ensure hw is notified of all pending wqes */
-	if (mlx5e_sq_has_room_for(sq, 1))
+	/* fill up remainder with NOPs */
+	while (sq->cev_counter != 0) {
+		while (!mlx5e_sq_has_room_for(sq, 1)) {
+			if (can_sleep != 0) {
+				mtx_unlock(&sq->lock);
+				msleep(4);
+				mtx_lock(&sq->lock);
+			} else {
+				goto done;
+			}
+		}
 		mlx5e_send_nop(sq, 1, true);
+	}
+done:
+	return;
+}
 
-	mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
+void
+mlx5e_sq_cev_timeout(void *arg)
+{
+	struct mlx5e_sq *sq = arg;
+
+	mtx_assert(&sq->lock, MA_OWNED);
+
+	/* check next state */
+	switch (sq->cev_next_state) {
+	case MLX5E_CEV_STATE_SEND_NOPS:
+		/* fill TX ring with NOPs, if any */
+		mlx5e_sq_send_nops_locked(sq, 0);
+
+		/* check if completed */
+		if (sq->cev_counter == 0) {
+			sq->cev_next_state = MLX5E_CEV_STATE_INITIAL;
+			return;
+		}
+		break;
+	default:
+		/* send NOPs on next timeout */
+		sq->cev_next_state = MLX5E_CEV_STATE_SEND_NOPS;
+		break;
+	}
+
+	/* restart timer */
+	callout_reset_curcpu(&sq->cev_callout, hz, mlx5e_sq_cev_timeout, sq);
 }
 
 static void
 mlx5e_close_sq_wait(struct mlx5e_sq *sq)
 {
+
+	mtx_lock(&sq->lock);
+	/* teardown event factor timer, if any */
+	sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
+	callout_stop(&sq->cev_callout);
+
+	/* send dummy NOPs in order to flush the transmit ring */
+	mlx5e_sq_send_nops_locked(sq, 1);
+	mtx_unlock(&sq->lock);
+
+	/* make sure it is safe to free the callout */
+	callout_drain(&sq->cev_callout);
+
+	/* error out remaining requests */
+	mlx5e_modify_sq(sq, MLX5_SQC_STATE_RDY, MLX5_SQC_STATE_ERR);
+
 	/* wait till SQ is empty */
+	mtx_lock(&sq->lock);
 	while (sq->cc != sq->pc) {
+		mtx_unlock(&sq->lock);
 		msleep(4);
 		sq->cq.mcq.comp(&sq->cq.mcq);
+		mtx_lock(&sq->lock);
 	}
+	mtx_unlock(&sq->lock);
 
 	mlx5e_disable_sq(sq);
 	mlx5e_destroy_sq(sq);
@@ -1412,24 +1470,13 @@ mlx5e_open_sqs(struct mlx5e_channel *c,
 	return (0);
 
 err_close_sqs:
-	for (tc--; tc >= 0; tc--) {
-		mlx5e_close_sq(&c->sq[tc]);
+	for (tc--; tc >= 0; tc--)
 		mlx5e_close_sq_wait(&c->sq[tc]);
-	}
 
 	return (err);
 }
 
 static void
-mlx5e_close_sqs(struct mlx5e_channel *c)
-{
-	int tc;
-
-	for (tc = 0; tc < c->num_tc; tc++)
-		mlx5e_close_sq(&c->sq[tc]);
-}
-
-static void
 mlx5e_close_sqs_wait(struct mlx5e_channel *c)
 {
 	int tc;
@@ -1446,9 +1493,19 @@ mlx5e_chan_mtx_init(struct mlx5e_channel
 	mtx_init(&c->rq.mtx, "mlx5rx", MTX_NETWORK_LOCK, MTX_DEF);
 
 	for (tc = 0; tc < c->num_tc; tc++) {
-		mtx_init(&c->sq[tc].lock, "mlx5tx", MTX_NETWORK_LOCK, MTX_DEF);
-		mtx_init(&c->sq[tc].comp_lock, "mlx5comp", MTX_NETWORK_LOCK,
+		struct mlx5e_sq *sq = c->sq + tc;
+
+		mtx_init(&sq->lock, "mlx5tx", MTX_NETWORK_LOCK, MTX_DEF);
+		mtx_init(&sq->comp_lock, "mlx5comp", MTX_NETWORK_LOCK,
 		    MTX_DEF);
+
+		callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
+
+		sq->cev_factor = c->priv->params_ethtool.tx_completion_fact;
+
+		/* ensure the TX completion event factor is not zero */
+		if (sq->cev_factor == 0)
+			sq->cev_factor = 1;
 	}
 }
 
@@ -1529,7 +1586,6 @@ mlx5e_open_channel(struct mlx5e_priv *pr
 	return (0);
 
 err_close_sqs:
-	mlx5e_close_sqs(c);
 	mlx5e_close_sqs_wait(c);
 
 err_close_rx_cq:
@@ -1554,7 +1610,6 @@ mlx5e_close_channel(struct mlx5e_channel
 	if (c == NULL)
 		return;
 	mlx5e_close_rq(&c->rq);
-	mlx5e_close_sqs(c);
 }
 
 static void

Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c	Fri May 20 06:47:42 2016	(r300276)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c	Fri May 20 06:54:58 2016	(r300277)
@@ -28,6 +28,18 @@
 #include "en.h"
 #include <machine/atomic.h>
 
+static inline bool
+mlx5e_do_send_cqe(struct mlx5e_sq *sq)
+{
+	sq->cev_counter++;
+	/* interleave the CQEs */
+	if (sq->cev_counter >= sq->cev_factor) {
+		sq->cev_counter = 0;
+		return (1);
+	}
+	return (0);
+}
+
 void
 mlx5e_send_nop(struct mlx5e_sq *sq, u32 ds_cnt, bool notify_hw)
 {
@@ -38,7 +50,10 @@ mlx5e_send_nop(struct mlx5e_sq *sq, u32 
 
 	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | MLX5_OPCODE_NOP);
 	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
-	wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+	if (mlx5e_do_send_cqe(sq))
+		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+	else
+		wqe->ctrl.fm_ce_se = 0;
 
 	sq->mbuf[pi].mbuf = NULL;
 	sq->mbuf[pi].num_bytes = 0;
@@ -340,7 +355,10 @@ skip_dma:
 
 	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((sq->pc << 8) | opcode);
 	wqe->ctrl.qpn_ds = cpu_to_be32((sq->sqn << 8) | ds_cnt);
-	wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+	if (mlx5e_do_send_cqe(sq))
+		wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE;
+	else
+		wqe->ctrl.fm_ce_se = 0;
 
 	/* Store pointer to mbuf */
 	sq->mbuf[pi].mbuf = mb;
@@ -374,9 +392,10 @@ mlx5e_poll_tx_cq(struct mlx5e_sq *sq, in
 	 */
 	sqcc = sq->cc;
 
-	while (budget--) {
+	while (budget > 0) {
 		struct mlx5_cqe64 *cqe;
 		struct mbuf *mb;
+		u16 x;
 		u16 ci;
 
 		cqe = mlx5e_get_cqe(&sq->cq);
@@ -385,24 +404,29 @@ mlx5e_poll_tx_cq(struct mlx5e_sq *sq, in
 
 		mlx5_cqwq_pop(&sq->cq.wq);
 
-		ci = sqcc & sq->wq.sz_m1;
-		mb = sq->mbuf[ci].mbuf;
-		sq->mbuf[ci].mbuf = NULL;	/* Safety clear */
+		/* update budget according to the event factor */
+		budget -= sq->cev_factor;
 
-		if (mb == NULL) {
-			if (sq->mbuf[ci].num_bytes == 0) {
-				/* NOP */
-				sq->stats.nop++;
-			}
-		} else {
-			bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map,
-			    BUS_DMASYNC_POSTWRITE);
-			bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map);
+		for (x = 0; x != sq->cev_factor; x++) {
+			ci = sqcc & sq->wq.sz_m1;
+			mb = sq->mbuf[ci].mbuf;
+			sq->mbuf[ci].mbuf = NULL;	/* Safety clear */
+
+			if (mb == NULL) {
+				if (sq->mbuf[ci].num_bytes == 0) {
+					/* NOP */
+					sq->stats.nop++;
+				}
+			} else {
+				bus_dmamap_sync(sq->dma_tag, sq->mbuf[ci].dma_map,
+				    BUS_DMASYNC_POSTWRITE);
+				bus_dmamap_unload(sq->dma_tag, sq->mbuf[ci].dma_map);
 
-			/* Free transmitted mbuf */
-			m_freem(mb);
+				/* Free transmitted mbuf */
+				m_freem(mb);
+			}
+			sqcc += sq->mbuf[ci].num_wqebbs;
 		}
-		sqcc += sq->mbuf[ci].num_wqebbs;
 	}
 
 	mlx5_cqwq_update_db_record(&sq->cq.wq);
@@ -450,6 +474,18 @@ mlx5e_xmit_locked(struct ifnet *ifp, str
 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
 			break;
 	}
+	/*
+	 * Check if we need to start the event timer which flushes the
+	 * transmit ring on timeout:
+	 */
+	if (unlikely(sq->cev_next_state == MLX5E_CEV_STATE_INITIAL &&
+	    sq->cev_factor != 1)) {
+		/* start the timer */
+		mlx5e_sq_cev_timeout(sq);
+	} else {
+		/* don't send NOPs yet */
+		sq->cev_next_state = MLX5E_CEV_STATE_HOLD_NOPS;
+	}
 	return (err);
 }
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201605200654.u4K6swd0090981>