Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 16 May 2019 15:46:05 +0000 (UTC)
From:      Hans Petter Selasky <hselasky@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-12@freebsd.org
Subject:   svn commit: r347717 - in stable/12/sys/dev/mlx5: . mlx5_core
Message-ID:  <201905161546.x4GFk5UM046334@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: hselasky
Date: Thu May 16 15:46:04 2019
New Revision: 347717
URL: https://svnweb.freebsd.org/changeset/base/347717

Log:
  MFC r347253:
  Protect from infinite sw-reset loop in mlx5core.
  
  Avoid an infinite software-initiated firmware reset loop, which may be
  caused by a hardware bug, by limiting how often resets are allowed.
  The timer between resets is restarted by each reset request, not by a
  successful reset.
  The interval between two resets can be configured via sysctl:
  hw.mlx5.sw_reset_timeout
  which is global to all mlx5 devices in the system.
  
  Submitted by:	slavash@
  Sponsored by:	Mellanox Technologies

Modified:
  stable/12/sys/dev/mlx5/driver.h
  stable/12/sys/dev/mlx5/mlx5_core/mlx5_health.c
Directory Properties:
  stable/12/   (props changed)

Modified: stable/12/sys/dev/mlx5/driver.h
==============================================================================
--- stable/12/sys/dev/mlx5/driver.h	Thu May 16 15:45:10 2019	(r347716)
+++ stable/12/sys/dev/mlx5/driver.h	Thu May 16 15:46:04 2019	(r347717)
@@ -536,6 +536,7 @@ struct mlx5_core_health {
 	unsigned long			flags;
 	struct work_struct		work;
 	struct delayed_work		recover_work;
+	unsigned int			last_reset_req;
 };
 
 #ifdef RATELIMIT

Modified: stable/12/sys/dev/mlx5/mlx5_core/mlx5_health.c
==============================================================================
--- stable/12/sys/dev/mlx5/mlx5_core/mlx5_health.c	Thu May 16 15:45:10 2019	(r347716)
+++ stable/12/sys/dev/mlx5/mlx5_core/mlx5_health.c	Thu May 16 15:46:04 2019	(r347717)
@@ -64,6 +64,12 @@ SYSCTL_INT(_hw_mlx5, OID_AUTO, fw_reset_enable, CTLFLA
     &mlx5_fw_reset_enable, 0,
     "Enable firmware reset");
 
+static unsigned int sw_reset_to = 1200;
+SYSCTL_UINT(_hw_mlx5, OID_AUTO, sw_reset_timeout, CTLFLAG_RWTUN,
+    &sw_reset_to, 0,
+    "Minimum timeout in seconds between two firmware resets");
+
+
 static int lock_sem_sw_reset(struct mlx5_core_dev *dev)
 {
 	int ret;
@@ -218,6 +224,32 @@ static void reset_fw_if_needed(struct mlx5_core_dev *d
 		    &dev->iseg->cmdq_addr_l_sz);
 }
 
+static bool
+mlx5_health_allow_reset(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	unsigned int delta;
+	bool ret;
+
+	if (health->last_reset_req != 0) {
+		delta = ticks - health->last_reset_req;
+		delta /= hz;
+		ret = delta >= sw_reset_to;
+	} else {
+		ret = true;
+	}
+
+	/*
+	 * 0 is reserved for "no request yet", so store -1 when ticks is 0.
+	 * Recorded on every request, including ones that are refused.
+	 */
+	health->last_reset_req = ticks ? : -1;
+	if (!ret)
+		mlx5_core_warn(dev, "Firmware reset elided due to "
+		    "auto-reset frequency threshold.\n");
+	return (ret);
+}
+
 #define MLX5_CRDUMP_WAIT_MS	60000
 #define MLX5_FW_RESET_WAIT_MS	1000
 #define MLX5_NIC_STATE_POLL_MS	5
@@ -243,7 +275,8 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev,
 	if (force)
 		goto err_state_done;
 
-	if (fatal_error == MLX5_SENSOR_FW_SYND_RFR) {
+	if (fatal_error == MLX5_SENSOR_FW_SYND_RFR &&
+	    mlx5_health_allow_reset(dev)) {
 		/* Get cr-dump and reset FW semaphore */
 		if (mlx5_core_is_pf(dev))
 			lock = lock_sem_sw_reset(dev);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201905161546.x4GFk5UM046334>