Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 29 Nov 2017 10:04:11 +0000 (UTC)
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r326363 - in head/sys/dev/mlx5: . mlx5_en
Message-ID:  <201711291004.vATA4BBu008365@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: kib
Date: Wed Nov 29 10:04:11 2017
New Revision: 326363
URL: https://svnweb.freebsd.org/changeset/base/326363

Log:
  Implement hardware mlx5(4) rx timestamps.
  
  Driver support is only provided for ConnectX4/5.
  
  System-time timestamp is calculated based on the free-running counter
  timestamp provided by hardware.  Driver periodically samples the
  counter to calibrate it against the system clock and uses linear
  interpolation to convert.  Stability of the crystal which drives the
  clock is +-50 ppm at the operational temperature, which makes the
  algorithm good enough.
  
  The calculation is somewhat delicate because all values are 64-bit and
  overflow the naive formula for linear interpolation.  The calculation
  drops the least significant bits in advance, see the PREC shift in
  mlx5_mbuf_tstmp().
  
  Hardware stamps can be turned off by 'ifconfig mceN -hwrxtstmp'.  Buggy
  firmware might result in small but visible errors in the reported
  timestamps, detectable e.g. by nonsensical (negative) RTT values for
  LAN pings.
  
  Reviewed by:	gallatin, hselasky
  Sponsored by:	Mellanox Technologies
  Differential revision:	https://reviews.freebsd.org/D12638

Modified:
  head/sys/dev/mlx5/device.h
  head/sys/dev/mlx5/mlx5_en/en.h
  head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
  head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c

Modified: head/sys/dev/mlx5/device.h
==============================================================================
--- head/sys/dev/mlx5/device.h	Wed Nov 29 09:40:11 2017	(r326362)
+++ head/sys/dev/mlx5/device.h	Wed Nov 29 10:04:11 2017	(r326363)
@@ -619,6 +619,8 @@ struct mlx5_cqe64 {
 	u8		op_own;
 };
 
+#define	MLX5_CQE_TSTMP_PTP	(1ULL << 63)
+
 static inline bool get_cqe_lro_timestamp_valid(struct mlx5_cqe64 *cqe)
 {
 	return (cqe->lro_tcppsh_abort_dupack >> 7) & 1;

Modified: head/sys/dev/mlx5/mlx5_en/en.h
==============================================================================
--- head/sys/dev/mlx5/mlx5_en/en.h	Wed Nov 29 09:40:11 2017	(r326362)
+++ head/sys/dev/mlx5/mlx5_en/en.h	Wed Nov 29 10:04:11 2017	(r326363)
@@ -650,6 +650,16 @@ struct mlx5e_flow_tables {
 	struct mlx5e_flow_table inner_rss;
 };
 
+#define	MLX5E_TSTMP_PREC 10
+
+struct mlx5e_clbr_point {	/* one calibration interval, read by rx path */
+	uint64_t base_curr;	/* system uptime, ns, at the newer sample */
+	uint64_t base_prev;	/* system uptime, ns, at the older sample */
+	uint64_t clbr_hw_prev;	/* HW counter at the older sample */
+	uint64_t clbr_hw_curr;	/* HW counter at the newer sample */
+	u_int clbr_gen;		/* generation; 0 means not valid for readers */
+};
+
 struct mlx5e_priv {
 	/* priv data path fields - start */
 	int	order_base_2_num_channels;
@@ -704,6 +714,12 @@ struct mlx5e_priv {
 	int	media_active_last;
 
 	struct callout watchdog;
+
+	struct callout tstmp_clbr;
+	int	clbr_done;
+	int	clbr_curr;
+	struct mlx5e_clbr_point clbr_points[2];
+	u_int	clbr_gen;
 };
 
 #define	MLX5E_NET_IP_ALIGN 2

Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c	Wed Nov 29 09:40:11 2017	(r326362)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c	Wed Nov 29 10:04:11 2017	(r326363)
@@ -154,6 +154,8 @@ static const struct {
 
 MALLOC_DEFINE(M_MLX5EN, "MLX5EN", "MLX5 Ethernet");
 
+static SYSCTL_NODE(_hw, OID_AUTO, mlx5, CTLFLAG_RW, 0, "MLX5 driver parameters");
+
 static void
 mlx5e_update_carrier(struct mlx5e_priv *priv)
 {
@@ -637,6 +639,109 @@ mlx5e_disable_async_events(struct mlx5e_priv *priv)
 	mtx_unlock(&priv->async_events_mtx);
 }
 
+static void mlx5e_calibration_callout(void *arg);
+static int mlx5e_calibration_duration = 20;	/* in calibration rounds */
+static int mlx5e_fast_calibration = 1;		/* interval, scaled by hz */
+static int mlx5e_normal_calibration = 30;	/* interval, scaled by hz */
+
+static SYSCTL_NODE(_hw_mlx5, OID_AUTO, calibr, CTLFLAG_RW, 0,
+    "MLX5 timestamp calibration parameters");
+
+SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, duration, CTLFLAG_RWTUN,
+    &mlx5e_calibration_duration, 0,
+    "Duration of initial calibration");
+SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, fast, CTLFLAG_RWTUN,
+    &mlx5e_fast_calibration, 0,
+    "Recalibration interval during initial calibration");
+SYSCTL_INT(_hw_mlx5_calibr, OID_AUTO, normal, CTLFLAG_RWTUN,
+    &mlx5e_normal_calibration, 0,
+    "Recalibration interval during normal operations");
+
+/*
+ * Ignites the calibration process, or re-arms the callout for the next round.
+ */
+static void
+mlx5e_reset_calibration_callout(struct mlx5e_priv *priv)
+{
+	/* First round runs synchronously; later ones go through the callout. */
+	if (priv->clbr_done == 0)
+		mlx5e_calibration_callout(priv);
+	else
+		callout_reset_curcpu(&priv->tstmp_clbr, (priv->clbr_done <
+		    mlx5e_calibration_duration ? mlx5e_fast_calibration :
+		    mlx5e_normal_calibration) * hz, mlx5e_calibration_callout,
+		    priv);
+}
+
+static uint64_t
+mlx5e_timespec2usec(const struct timespec *ts)
+{
+	/* NB: despite "usec" in the name, the result is in nanoseconds. */
+	return ((uint64_t)ts->tv_sec * 1000000000 + ts->tv_nsec);
+}
+
+static uint64_t
+mlx5e_hw_clock(struct mlx5e_priv *priv)
+{
+	struct mlx5_init_seg *iseg;
+	uint32_t hw_h, hw_h1, hw_l;
+	/* Retry until the high word is the same before and after the low read. */
+	iseg = priv->mdev->iseg;
+	do {
+		hw_h = ioread32be(&iseg->internal_timer_h);
+		hw_l = ioread32be(&iseg->internal_timer_l);
+		hw_h1 = ioread32be(&iseg->internal_timer_h);
+	} while (hw_h1 != hw_h);
+	return (((uint64_t)hw_h << 32) | hw_l);
+}
+
+/*
+ * The calibration callout.  It runs either in the context of the thread
+ * which enables calibration, or in callout.  It snapshots the system and
+ * adapter clocks into the spare calibration point and then publishes it,
+ * so that the rx path can read consistent data lockless.  If the adapter
+ * clock did not advance since the last sample, calibration is stopped.
+ */
+static void
+mlx5e_calibration_callout(void *arg)
+{
+	struct mlx5e_priv *priv;
+	struct mlx5e_clbr_point *next, *curr;
+	struct timespec ts;
+	int clbr_curr_next;
+
+	priv = arg;
+	curr = &priv->clbr_points[priv->clbr_curr];
+	clbr_curr_next = priv->clbr_curr + 1;
+	if (clbr_curr_next >= nitems(priv->clbr_points))
+		clbr_curr_next = 0;
+	next = &priv->clbr_points[clbr_curr_next];
+	/* The new interval starts where the current one ended. */
+	next->base_prev = curr->base_curr;
+	next->clbr_hw_prev = curr->clbr_hw_curr;
+	/* The rx path divides by exactly this delta; it must not be zero. */
+	next->clbr_hw_curr = mlx5e_hw_clock(priv);
+	if (((next->clbr_hw_curr - next->clbr_hw_prev) >> MLX5E_TSTMP_PREC) ==
+	    0) {
+		if_printf(priv->ifp, "HW failed tstmp frozen %#jx %#jx, "
+		    "disabling\n", (uintmax_t)next->clbr_hw_curr,
+		    (uintmax_t)next->clbr_hw_prev);
+		priv->clbr_done = 0;
+		return;
+	}
+	nanouptime(&ts);
+	next->base_curr = mlx5e_timespec2usec(&ts);
+	/* Retire curr first, then publish next under a fresh generation. */
+	curr->clbr_gen = 0;
+	atomic_thread_fence_rel();
+	priv->clbr_curr = clbr_curr_next;
+	atomic_store_rel_int(&next->clbr_gen, ++(priv->clbr_gen));
+
+	if (priv->clbr_done < mlx5e_calibration_duration)
+		priv->clbr_done++;
+	mlx5e_reset_calibration_callout(priv);
+}
+
 static const char *mlx5e_rq_stats_desc[] = {
 	MLX5E_RQ_STATS(MLX5E_STATS_DESC)
 };
@@ -2693,6 +2798,16 @@ mlx5e_ioctl(struct ifnet *ifp, u_long command, caddr_t
 				mlx5e_open_locked(ifp);
 			}
 		}
+		if (mask & IFCAP_HWRXTSTMP) {
+			ifp->if_capenable ^= IFCAP_HWRXTSTMP;
+			if (ifp->if_capenable & IFCAP_HWRXTSTMP) {
+				if (priv->clbr_done == 0)
+					mlx5e_reset_calibration_callout(priv);
+			} else {
+				callout_drain(&priv->tstmp_clbr);
+				priv->clbr_done = 0;
+			}
+		}
 out:
 		PRIV_UNLOCK(priv);
 		break;
@@ -3198,7 +3313,7 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
 	ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
 	ifp->if_capabilities |= IFCAP_LRO;
 	ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO;
-	ifp->if_capabilities |= IFCAP_HWSTATS;
+	ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP;
 
 	/* set TSO limits so that we don't have to drop TX packets */
 	ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
@@ -3347,6 +3462,13 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
 	mlx5e_update_stats(priv);
 	mtx_unlock(&priv->async_events_mtx);
 
+	SYSCTL_ADD_INT(&priv->sysctl_ctx, SYSCTL_CHILDREN(priv->sysctl_ifnet),
+	    OID_AUTO, "rx_clbr_done", CTLFLAG_RD,
+	    &priv->clbr_done, 0,
+	    "RX timestamps calibration state");
+	callout_init(&priv->tstmp_clbr, CALLOUT_DIRECT);
+	mlx5e_reset_calibration_callout(priv);
+
 	return (priv);
 
 err_dealloc_transport_domain:
@@ -3390,6 +3512,8 @@ mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vp
 
 	/* stop watchdog timer */
 	callout_drain(&priv->watchdog);
+
+	callout_drain(&priv->tstmp_clbr);
 
 	if (priv->vlan_attach != NULL)
 		EVENTHANDLER_DEREGISTER(vlan_config, priv->vlan_attach);

Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c	Wed Nov 29 09:40:11 2017	(r326362)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c	Wed Nov 29 10:04:11 2017	(r326363)
@@ -179,13 +179,43 @@ mlx5e_lro_update_hdr(struct mbuf *mb, struct mlx5_cqe6
 	/* TODO: handle tcp checksum */
 }
 
+static uint64_t
+mlx5e_mbuf_tstmp(struct mlx5e_priv *priv, uint64_t hw_tstmp)
+{
+	struct mlx5e_clbr_point *cp;
+	uint64_t a1, a2, res;
+	u_int gen;
+	/* Linear interpolation; retried until a stable, non-zero gen is seen. */
+	do {
+		cp = &priv->clbr_points[priv->clbr_curr];
+		gen = atomic_load_acq_int(&cp->clbr_gen);
+		a1 = (hw_tstmp - cp->clbr_hw_prev) >> MLX5E_TSTMP_PREC;
+		a2 = (cp->base_curr - cp->base_prev) >> MLX5E_TSTMP_PREC;
+		res = (a1 * a2) << MLX5E_TSTMP_PREC;
+
+		/*
+		 * Divisor is expected to be non-zero: the calibration
+		 * callback checks for a halted clock and then disables
+		 * timestamping.  NOTE(review): verify it tests this delta.
+		 */
+		res /= (cp->clbr_hw_curr - cp->clbr_hw_prev) >>
+		    MLX5E_TSTMP_PREC;
+		/* res is the offset from base_prev; make it an absolute time. */
+		res += cp->base_prev;
+		atomic_thread_fence_acq();
+	} while (gen == 0 || gen != cp->clbr_gen);
+	return (res);
+}
+
 static inline void
 mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
     struct mlx5e_rq *rq, struct mbuf *mb,
     u32 cqe_bcnt)
 {
 	struct ifnet *ifp = rq->ifp;
+	struct mlx5e_channel *c;
 	int lro_num_seg;	/* HW LRO session aggregated packets counter */
+	uint64_t tstmp;
 
 	lro_num_seg = be32_to_cpu(cqe->srqn) >> 24;
 	if (lro_num_seg > 1) {
@@ -249,6 +279,21 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
 	if (cqe_has_vlan(cqe)) {
 		mb->m_pkthdr.ether_vtag = be16_to_cpu(cqe->vlan_info);
 		mb->m_flags |= M_VLANTAG;
+	}
+
+	c = container_of(rq, struct mlx5e_channel, rq);
+	if (c->priv->clbr_done >= 2) {
+		tstmp = mlx5e_mbuf_tstmp(c->priv, be64_to_cpu(cqe->timestamp));
+		if ((tstmp & MLX5_CQE_TSTMP_PTP) != 0) {
+			/*
+			 * Timestamp was taken on the packet entrance,
+			 * instead of the cqe generation.
+			 */
+			tstmp &= ~MLX5_CQE_TSTMP_PTP;
+			mb->m_flags |= M_TSTMP_HPREC;
+		}
+		mb->m_pkthdr.rcv_tstmp = tstmp;
+		mb->m_flags |= M_TSTMP;
 	}
 }
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201711291004.vATA4BBu008365>