Date: Fri, 23 Mar 2018 18:24:09 +0000 (UTC) From: Hans Petter Selasky <hselasky@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r331451 - in head/sys/dev/mlx5: . mlx5_core Message-ID: <201803231824.w2NIO9uH060786@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: hselasky Date: Fri Mar 23 18:24:09 2018 New Revision: 331451 URL: https://svnweb.freebsd.org/changeset/base/331451 Log: Issue a software reset on firmware assert in mlx5core. If a FW assert is considered fatal, indicated by a new bit in the health buffer, reset the FW. After the reset, follow the normal recovery flow. Submitted by: slavash@ MFC after: 1 week Sponsored by: Mellanox Technologies Modified: head/sys/dev/mlx5/device.h head/sys/dev/mlx5/mlx5_core/mlx5_health.c Modified: head/sys/dev/mlx5/device.h ============================================================================== --- head/sys/dev/mlx5/device.h Fri Mar 23 18:24:02 2018 (r331450) +++ head/sys/dev/mlx5/device.h Fri Mar 23 18:24:09 2018 (r331451) @@ -410,6 +410,10 @@ struct mlx5_cmd_layout { u8 status_own; }; +enum mlx5_fatal_assert_bit_offsets { + MLX5_RFR_OFFSET = 31, +}; + struct mlx5_health_buffer { __be32 assert_var[5]; __be32 rsvd0[3]; @@ -418,10 +422,18 @@ struct mlx5_health_buffer { __be32 rsvd1[2]; __be32 fw_ver; __be32 hw_id; - __be32 rsvd2; + __be32 rfr; u8 irisc_index; u8 synd; __be16 ext_synd; +}; + +enum mlx5_initializing_bit_offsets { + MLX5_FW_RESET_SUPPORTED_OFFSET = 30, +}; + +enum mlx5_cmd_addr_l_sz_offset { + MLX5_NIC_IFC_OFFSET = 8, }; struct mlx5_init_seg { Modified: head/sys/dev/mlx5/mlx5_core/mlx5_health.c ============================================================================== --- head/sys/dev/mlx5/mlx5_core/mlx5_health.c Fri Mar 23 18:24:02 2018 (r331450) +++ head/sys/dev/mlx5/mlx5_core/mlx5_health.c Fri Mar 23 18:24:09 2018 (r331451) @@ -56,6 +56,7 @@ enum { MLX5_SENSOR_PCI_ERR = 2, MLX5_SENSOR_NIC_DISABLED = 3, MLX5_SENSOR_NIC_SW_RESET = 4, + MLX5_SENSOR_FW_SYND_RFR = 5, }; static u8 get_nic_mode(struct mlx5_core_dev *dev) @@ -63,6 +64,18 @@ static u8 get_nic_mode(struct mlx5_core_dev *dev) return (ioread32be(&dev->iseg->cmdq_addr_l_sz) >> 8) & 7; } +static bool sensor_fw_synd_rfr(struct mlx5_core_dev *dev) +{ + struct mlx5_core_health *health = &dev->priv.health; + struct mlx5_health_buffer __iomem *h = health->health; + u32 rfr = ioread32be(&h->rfr) >> MLX5_RFR_OFFSET; + u8 synd = ioread8(&h->synd); + + if (rfr && synd) + mlx5_core_dbg(dev, "FW requests reset, synd: %d\n", synd); + return rfr && synd; +} + static void mlx5_trigger_cmd_completions(struct mlx5_core_dev *dev) { unsigned long flags; @@ -115,10 +128,44 @@ static u32 check_fatal_sensors(struct mlx5_core_dev *d return MLX5_SENSOR_NIC_DISABLED; if (sensor_nic_sw_reset(dev)) return MLX5_SENSOR_NIC_SW_RESET; + if (sensor_fw_synd_rfr(dev)) + return MLX5_SENSOR_FW_SYND_RFR; return MLX5_SENSOR_NO_ERR; } +static void reset_fw_if_needed(struct mlx5_core_dev *dev) +{ + bool supported = (ioread32be(&dev->iseg->initializing) >> + MLX5_FW_RESET_SUPPORTED_OFFSET) & 1; + u32 cmdq_addr, fatal_error; + + if (!supported) + return; + + /* The reset only needs to be issued by one PF. The health buffer is + * shared between all functions, and will be cleared during a reset. + * Check again to avoid a redundant 2nd reset. If the fatal erros was + * PCI related a reset won't help. + */ + fatal_error = check_fatal_sensors(dev); + if (fatal_error == MLX5_SENSOR_PCI_COMM_ERR || + fatal_error == MLX5_SENSOR_NIC_DISABLED || + fatal_error == MLX5_SENSOR_NIC_SW_RESET) { + mlx5_core_warn(dev, "Not issuing FW reset. Either it's already done or won't help."); + return; + } + + mlx5_core_warn(dev, "Issuing FW Reset\n"); + /* Write the NIC interface field to initiate the reset, the command + * interface address also resides here, don't overwrite it. + */ + cmdq_addr = ioread32be(&dev->iseg->cmdq_addr_l_sz); + iowrite32be((cmdq_addr & 0xFFFFF000) | + MLX5_NIC_IFC_SW_RESET << MLX5_NIC_IFC_OFFSET, + &dev->iseg->cmdq_addr_l_sz); +} + void mlx5_enter_error_state(struct mlx5_core_dev *dev, bool force) { mutex_lock(&dev->intf_state_mutex); @@ -130,6 +177,7 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev, if (!force) mlx5_core_err(dev, "internal state error detected\n"); if (check_fatal_sensors(dev) || force) { + reset_fw_if_needed(dev); dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR; mlx5_trigger_cmd_completions(dev); } @@ -230,11 +278,14 @@ static void health_care(struct work_struct *work) recover_delay = msecs_to_jiffies(get_recovery_delay(dev)); spin_lock_irqsave(&health->wq_lock, flags); - if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) + if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags)) { + mlx5_core_warn(dev, "Scheduling recovery work with %lums delay\n", + recover_delay); schedule_delayed_work(&health->recover_work, recover_delay); - else + } else { dev_err(&dev->pdev->dev, "new health works are not permitted at this stage\n"); + } spin_unlock_irqrestore(&health->wq_lock, flags); }
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201803231824.w2NIO9uH060786>