Skip site navigation (1) Skip section navigation (2)
Date:      Wed, 5 Dec 2018 13:43:38 +0000 (UTC)
From:      Slava Shwartsman <slavash@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r341560 - in head/sys/dev/mlx5: . mlx5_core
Message-ID:  <201812051343.wB5DhcYr081402@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: slavash
Date: Wed Dec  5 13:43:37 2018
New Revision: 341560
URL: https://svnweb.freebsd.org/changeset/base/341560

Log:
  mlx5: Fix use-after-free in self-healing flow
  
  When the mlx5 health mechanism detects a problem while the driver
  is in the middle of init_one or remove_one, the driver needs to prevent
  the health mechanism from scheduling future work; if future work
  is scheduled, there is a problem with use-after-free: the system WQ
  tries to run the work item (which has been freed) at the scheduled
  future time.
  
  Prevent this by disabling work item scheduling in the health mechanism
  when the driver is in the middle of init_one() or remove_one().
  
  Approved by:    hselasky (mentor)
  MFC after:      1 week
  Sponsored by:   Mellanox Technologies

Modified:
  head/sys/dev/mlx5/driver.h
  head/sys/dev/mlx5/mlx5_core/mlx5_health.c
  head/sys/dev/mlx5/mlx5_core/mlx5_main.c

Modified: head/sys/dev/mlx5/driver.h
==============================================================================
--- head/sys/dev/mlx5/driver.h	Wed Dec  5 13:43:07 2018	(r341559)
+++ head/sys/dev/mlx5/driver.h	Wed Dec  5 13:43:37 2018	(r341560)
@@ -923,7 +923,7 @@ void mlx5_unmap_free_uar(struct mlx5_core_dev *mdev, s
 void mlx5_health_cleanup(struct mlx5_core_dev *dev);
 int mlx5_health_init(struct mlx5_core_dev *dev);
 void mlx5_start_health_poll(struct mlx5_core_dev *dev);
-void mlx5_stop_health_poll(struct mlx5_core_dev *dev);
+void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health);
 void mlx5_drain_health_wq(struct mlx5_core_dev *dev);
 void mlx5_drain_health_recovery(struct mlx5_core_dev *dev);
 void mlx5_trigger_health_work(struct mlx5_core_dev *dev);

Modified: head/sys/dev/mlx5/mlx5_core/mlx5_health.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_core/mlx5_health.c	Wed Dec  5 13:43:07 2018	(r341559)
+++ head/sys/dev/mlx5/mlx5_core/mlx5_health.c	Wed Dec  5 13:43:37 2018	(r341560)
@@ -516,9 +516,17 @@ void mlx5_start_health_poll(struct mlx5_core_dev *dev)
 		  round_jiffies(jiffies + MLX5_HEALTH_POLL_INTERVAL));
 }
 
-void mlx5_stop_health_poll(struct mlx5_core_dev *dev)
+void mlx5_stop_health_poll(struct mlx5_core_dev *dev, bool disable_health)
 {
 	struct mlx5_core_health *health = &dev->priv.health;
+	unsigned long flags;
+
+	if (disable_health) {
+		spin_lock_irqsave(&health->wq_lock, flags);
+		set_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags);
+		set_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags);
+		spin_unlock_irqrestore(&health->wq_lock, flags);
+	}
 
 	del_timer_sync(&health->timer);
 }

Modified: head/sys/dev/mlx5/mlx5_core/mlx5_main.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_core/mlx5_main.c	Wed Dec  5 13:43:07 2018	(r341559)
+++ head/sys/dev/mlx5/mlx5_core/mlx5_main.c	Wed Dec  5 13:43:37 2018	(r341560)
@@ -1107,7 +1107,7 @@ err_cleanup_once:
 		mlx5_cleanup_once(dev);
 
 err_stop_poll:
-	mlx5_stop_health_poll(dev);
+	mlx5_stop_health_poll(dev, boot);
 	if (mlx5_cmd_teardown_hca(dev)) {
 		device_printf((&dev->pdev->dev)->bsddev, "ERR: ""tear_down_hca failed, skip cleanup\n");
 		goto out_err;
@@ -1159,7 +1159,7 @@ static int mlx5_unload_one(struct mlx5_core_dev *dev, 
 	mlx5_disable_msix(dev);
         if (cleanup)
                 mlx5_cleanup_once(dev);
-	mlx5_stop_health_poll(dev);
+	mlx5_stop_health_poll(dev, cleanup);
 	err = mlx5_cmd_teardown_hca(dev);
 	if (err) {
 		device_printf((&dev->pdev->dev)->bsddev, "ERR: ""tear_down_hca failed, skip cleanup\n");
@@ -1405,6 +1405,12 @@ static int mlx5_try_fast_unload(struct mlx5_core_dev *
 		mlx5_core_dbg(dev, "Device in internal error state, giving up\n");
 		return -EAGAIN;
 	}
+
+	/* Panic tear down fw command will stop the PCI bus communication
+	 * with the HCA, so the health poll is no longer needed.
+	 */
+	mlx5_drain_health_wq(dev);
+	mlx5_stop_health_poll(dev, false);
 
 	err = mlx5_cmd_force_teardown_hca(dev);
 	if (err) {



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201812051343.wB5DhcYr081402>