Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 30 Mar 2018 19:22:42 +0000 (UTC)
From:      Hans Petter Selasky <hselasky@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org
Subject:   svn commit: r331814 - in stable/11/sys/dev/mlx5: . mlx5_core
Message-ID:  <201803301922.w2UJMgmq004440@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: hselasky
Date: Fri Mar 30 19:22:42 2018
New Revision: 331814
URL: https://svnweb.freebsd.org/changeset/base/331814

Log:
  MFC r331451:
  Issue a software reset on firmware assert in mlx5core.
  
  If a FW assert is considered fatal, indicated by a new bit in the
  health buffer, reset the FW. After the reset, follow the normal
  recovery flow.
  
  Submitted by:	slavash@
  Sponsored by:	Mellanox Technologies

Modified:
  stable/11/sys/dev/mlx5/device.h
  stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c
Directory Properties:
  stable/11/   (props changed)

Modified: stable/11/sys/dev/mlx5/device.h
==============================================================================
--- stable/11/sys/dev/mlx5/device.h	Fri Mar 30 19:21:19 2018	(r331813)
+++ stable/11/sys/dev/mlx5/device.h	Fri Mar 30 19:22:42 2018	(r331814)
@@ -410,6 +410,10 @@ struct mlx5_cmd_layout {
 	u8		status_own;
 };
 
+enum mlx5_fatal_assert_bit_offsets {
+	MLX5_RFR_OFFSET = 31,
+};
+
 struct mlx5_health_buffer {
 	__be32		assert_var[5];
 	__be32		rsvd0[3];
@@ -418,10 +422,18 @@ struct mlx5_health_buffer {
 	__be32		rsvd1[2];
 	__be32		fw_ver;
 	__be32		hw_id;
-	__be32		rsvd2;
+	__be32		rfr;
 	u8		irisc_index;
 	u8		synd;
 	__be16		ext_synd;
+};
+
+enum mlx5_initializing_bit_offsets {
+	MLX5_FW_RESET_SUPPORTED_OFFSET = 30,
+};
+
+enum mlx5_cmd_addr_l_sz_offset {
+	MLX5_NIC_IFC_OFFSET = 8,
 };
 
 struct mlx5_init_seg {

Modified: stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c
==============================================================================
--- stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c	Fri Mar 30 19:21:19 2018	(r331813)
+++ stable/11/sys/dev/mlx5/mlx5_core/mlx5_health.c	Fri Mar 30 19:22:42 2018	(r331814)
@@ -56,6 +56,7 @@ enum  {
 	MLX5_SENSOR_PCI_ERR		= 2,
 	MLX5_SENSOR_NIC_DISABLED	= 3,
 	MLX5_SENSOR_NIC_SW_RESET	= 4,
+	MLX5_SENSOR_FW_SYND_RFR		= 5,
 };
 
 static u8 get_nic_mode(struct mlx5_core_dev *dev)
@@ -63,6 +64,18 @@ static u8 get_nic_mode(struct mlx5_core_dev *dev)
 	return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7;
 }
 
+static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	struct mlx5_health_buffer __iomem *h = health->health;
+	u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET;
+	u8 synd = ioread8(&h->synd);
+
+	if (rfr && synd)
+		mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd);
+	return rfr && synd;
+}
+
 static void mlx5_trigger_cmd_completions(struct mlx5_core_dev *dev)
 {
 	unsigned long flags;
@@ -115,10 +128,44 @@ static u32 check_fatal_sensors(struct mlx5_core_dev *d
 		return MLX5_SENSOR_NIC_DISABLED;
 	if (sensor_nic_sw_reset(dev))
 		return MLX5_SENSOR_NIC_SW_RESET;
+	if (sensor_fw_synd_rfr(dev))
+		return MLX5_SENSOR_FW_SYND_RFR;
 
 	return MLX5_SENSOR_NO_ERR;
 }
 
+static void reset_fw_if_needed(struct mlx5_core_dev *dev)
+{
+	bool supported = (ioread32be(&dev->iseg->initializing) >>
+			  MLX5_FW_RESET_SUPPORTED_OFFSET) & 1;
+	u32 cmdq_addr, fatal_error;
+
+	if (!supported)
+		return;
+
+	/* The reset only needs to be issued by one PF. The health buffer is
+	 * shared between all functions, and will be cleared during a reset.
+	 * Check again to avoid a redundant 2nd reset. If the fatal erros was
+	 * PCI related a reset won't help.
+	 */
+	fatal_error = check_fatal_sensors(dev);
+	if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR ||
+	    fatal_error == MLX5_SENSOR_NIC_DISABLED ||
+	    fatal_error == MLX5_SENSOR_NIC_SW_RESET) {
+		mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help.");
+		return;
+	}
+
+	mlx5_core_warn(dev, "Issuing FW Reset\n");
+	/* Write the NIC interface field to initiate the reset, the command
+	 * interface address also resides here, don't overwrite it.
+	 */
+	cmdq_addr = ioread32be(&dev->iseg->cmdq_addr_l_sz);
+	iowrite32be((cmdq_addr & 0xFFFFF000) |
+		    MLX5_NIC_IFC_SW_RESET << MLX5_NIC_IFC_OFFSET,
+		    &dev->iseg->cmdq_addr_l_sz);
+}
+
 void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force)
 {
 	mutex_lock(&dev->intf_state_mutex);
@@ -130,6 +177,7 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev,
 	if (!force)
 		mlx5_core_err(dev, "internal state error detected\n");
 	if (check_fatal_sensors(dev) || force) {
+		reset_fw_if_needed(dev);
 		dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
 		mlx5_trigger_cmd_completions(dev);
 	}
@@ -230,11 +278,14 @@ static void health_care(struct work_struct *work)
 	recover_delay = msecs_to_jiffies(get_recovery_delay(dev));
 
 	spin_lock_irqsave(&health->wq_lock, flags);
-	if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags))
+	if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) {
+		mlx5_core_warn(dev, "Scheduling recovery work with %lums delay\n",
+			       recover_delay);
 		schedule_delayed_work(&health->recover_work, recover_delay);
-	else
+	} else {
 		dev_err(&dev->pdev->dev,
 			"new health works are not permitted at this stage\n");
+	}
 	spin_unlock_irqrestore(&health->wq_lock, flags);
 }
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201803301922.w2UJMgmq004440>