Date: Thu, 16 May 2019 18:28:13 +0000 (UTC) From: Hans Petter Selasky <hselasky@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org Subject: svn commit: r347880 - in stable/11/sys/dev/mlx5: . mlx5_core Message-ID: <201905161828.x4GISDGZ041083@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: hselasky Date: Thu May 16 18:28:12 2019 New Revision: 347880 URL: https://svnweb.freebsd.org/changeset/base/347880 Log: MFC r347323: Fix race between driver unload and dumping firmware in mlx5core. The present code uses lock-less accesses to the dump data to prevent top-level ioctls from blocking the bottom-level call to dump. Unfortunately, this depends on the type stability of the dump data structure, which makes it non-functional during driver teardown. Switch to the mutex locking scheme where top levels use the mutex in the bound regions, while copyouts and drain for completion utilize condvars. The mutex lifetime is guaranteed to be strictly larger than the time interval in which the driver can initiate a dump, and most of the control fields of the old struct mlx5_dump_data are directly embedded into struct mlx5_core_dev. Submitted by: kib@ Sponsored by: Mellanox Technologies Modified: stable/11/sys/dev/mlx5/driver.h stable/11/sys/dev/mlx5/mlx5_core/mlx5_fwdump.c stable/11/sys/dev/mlx5/mlx5_core/mlx5_main.c Directory Properties: stable/11/ (props changed) Modified: stable/11/sys/dev/mlx5/driver.h ============================================================================== --- stable/11/sys/dev/mlx5/driver.h Thu May 16 18:27:27 2019 (r347879) +++ stable/11/sys/dev/mlx5/driver.h Thu May 16 18:28:12 2019 (r347880) @@ -638,7 +638,6 @@ struct mlx5_special_contexts { }; struct mlx5_flow_root_namespace; -struct mlx5_dump_data; struct mlx5_core_dev { struct pci_dev *pdev; /* sync pci state */ @@ -678,7 +677,12 @@ struct mlx5_core_dev { struct mlx5_flow_root_namespace *sniffer_rx_root_ns; struct mlx5_flow_root_namespace *sniffer_tx_root_ns; u32 num_q_counter_allocated[MLX5_INTERFACE_NUMBER]; - struct mlx5_dump_data *dump_data; + const struct mlx5_crspace_regmap *dump_rege; + uint32_t *dump_data; + unsigned dump_size; + bool dump_valid; + bool dump_copyout; + struct mtx dump_lock; struct sysctl_ctx_list sysctl_ctx; int msix_eqvec; Modified: 
stable/11/sys/dev/mlx5/mlx5_core/mlx5_fwdump.c ============================================================================== --- stable/11/sys/dev/mlx5/mlx5_core/mlx5_fwdump.c Thu May 16 18:27:27 2019 (r347879) +++ stable/11/sys/dev/mlx5/mlx5_core/mlx5_fwdump.c Thu May 16 18:28:12 2019 (r347880) @@ -39,14 +39,6 @@ extern const struct mlx5_crspace_regmap mlx5_crspace_r extern const struct mlx5_crspace_regmap mlx5_crspace_regmap_mt4115[]; extern const struct mlx5_crspace_regmap mlx5_crspace_regmap_connectx5[]; -struct mlx5_dump_data { - const struct mlx5_crspace_regmap *rege; - uint32_t *dump; - unsigned dump_size; - int dump_valid; - struct mtx dump_lock; -}; - static MALLOC_DEFINE(M_MLX5_DUMP, "MLX5DUMP", "MLX5 Firmware dump"); static unsigned @@ -61,20 +53,20 @@ mlx5_fwdump_getsize(const struct mlx5_crspace_regmap * } static void -mlx5_fwdump_destroy_dd(struct mlx5_dump_data *dd) +mlx5_fwdump_destroy_dd(struct mlx5_core_dev *mdev) { - mtx_destroy(&dd->dump_lock); - free(dd->dump, M_MLX5_DUMP); - free(dd, M_MLX5_DUMP); + mtx_assert(&mdev->dump_lock, MA_OWNED); + free(mdev->dump_data, M_MLX5_DUMP); + mdev->dump_data = NULL; } void mlx5_fwdump_prep(struct mlx5_core_dev *mdev) { - struct mlx5_dump_data *dd; int error; + mdev->dump_data = NULL; error = mlx5_vsc_find_cap(mdev); if (error != 0) { /* Inability to create a firmware dump is not fatal. 
*/ @@ -82,47 +74,39 @@ mlx5_fwdump_prep(struct mlx5_core_dev *mdev) "mlx5_fwdump_prep failed %d\n", error); return; } - dd = malloc(sizeof(struct mlx5_dump_data), M_MLX5_DUMP, M_WAITOK); switch (pci_get_device(mdev->pdev->dev.bsddev)) { case 0x1013: - dd->rege = mlx5_crspace_regmap_mt4115; + mdev->dump_rege = mlx5_crspace_regmap_mt4115; break; case 0x1015: - dd->rege = mlx5_crspace_regmap_mt4117; + mdev->dump_rege = mlx5_crspace_regmap_mt4117; break; case 0x1017: case 0x1019: - dd->rege = mlx5_crspace_regmap_connectx5; + mdev->dump_rege = mlx5_crspace_regmap_connectx5; break; default: - free(dd, M_MLX5_DUMP); return; /* silently fail, do not prevent driver attach */ } - dd->dump_size = mlx5_fwdump_getsize(dd->rege); - dd->dump = malloc(dd->dump_size * sizeof(uint32_t), M_MLX5_DUMP, - M_WAITOK | M_ZERO); - dd->dump_valid = 0; - mtx_init(&dd->dump_lock, "mlx5dmp", NULL, MTX_DEF | MTX_NEW); - if (atomic_cmpset_rel_ptr((uintptr_t *)&mdev->dump_data, 0, - (uintptr_t)dd) == 0) - mlx5_fwdump_destroy_dd(dd); + mdev->dump_size = mlx5_fwdump_getsize(mdev->dump_rege); + mdev->dump_data = malloc(mdev->dump_size * sizeof(uint32_t), + M_MLX5_DUMP, M_WAITOK | M_ZERO); + mdev->dump_valid = false; + mdev->dump_copyout = false; } void mlx5_fwdump(struct mlx5_core_dev *mdev) { - struct mlx5_dump_data *dd; const struct mlx5_crspace_regmap *r; uint32_t i, ri; int error; dev_info(&mdev->pdev->dev, "Issuing FW dump\n"); - dd = (struct mlx5_dump_data *)atomic_load_acq_ptr((uintptr_t *) - &mdev->dump_data); - if (dd == NULL) - return; - mtx_lock(&dd->dump_lock); - if (dd->dump_valid) { + mtx_lock(&mdev->dump_lock); + if (mdev->dump_data == NULL) + goto failed; + if (mdev->dump_valid) { /* only one dump */ dev_warn(&mdev->pdev->dev, "Only one FW dump can be captured aborting FW dump\n"); @@ -136,37 +120,51 @@ mlx5_fwdump(struct mlx5_core_dev *mdev) error = mlx5_vsc_set_space(mdev, MLX5_VSC_DOMAIN_PROTECTED_CRSPACE); if (error != 0) goto unlock_vsc; - for (i = 0, r = dd->rege; r->cnt != 0; 
r++) { + for (i = 0, r = mdev->dump_rege; r->cnt != 0; r++) { for (ri = 0; ri < r->cnt; ri++) { error = mlx5_vsc_read(mdev, r->addr + ri * 4, - &dd->dump[i]); + &mdev->dump_data[i]); if (error != 0) goto unlock_vsc; i++; } } - atomic_store_rel_int(&dd->dump_valid, 1); + mdev->dump_valid = true; unlock_vsc: mlx5_vsc_unlock(mdev); failed: - mtx_unlock(&dd->dump_lock); + mtx_unlock(&mdev->dump_lock); } void mlx5_fwdump_clean(struct mlx5_core_dev *mdev) { - struct mlx5_dump_data *dd; - for (;;) { - dd = mdev->dump_data; - if (dd == NULL) - return; - if (atomic_cmpset_ptr((uintptr_t *)&mdev->dump_data, - (uintptr_t)dd, 0) == 1) { - mlx5_fwdump_destroy_dd(dd); - return; + mtx_lock(&mdev->dump_lock); + while (mdev->dump_copyout) + msleep(&mdev->dump_copyout, &mdev->dump_lock, 0, "mlx5fwc", 0); + mlx5_fwdump_destroy_dd(mdev); + mtx_unlock(&mdev->dump_lock); +} + +static int +mlx5_fwdump_reset(struct mlx5_core_dev *mdev) +{ + int error; + + error = 0; + mtx_lock(&mdev->dump_lock); + if (mdev->dump_data != NULL) { + while (mdev->dump_copyout) { + msleep(&mdev->dump_copyout, &mdev->dump_lock, + 0, "mlx5fwr", 0); } + mdev->dump_valid = false; + } else { + error = ENOENT; } + mtx_unlock(&mdev->dump_lock); + return (error); } static int @@ -190,29 +188,37 @@ mlx5_dbsf_to_core(const struct mlx5_tool_addr *devaddr } static int -mlx5_fwdump_copyout(struct mlx5_dump_data *dd, struct mlx5_fwdump_get *fwg) +mlx5_fwdump_copyout(struct mlx5_core_dev *mdev, struct mlx5_fwdump_get *fwg) { const struct mlx5_crspace_regmap *r; struct mlx5_fwdump_reg rv, *urv; uint32_t i, ri; int error; - if (dd == NULL) + mtx_lock(&mdev->dump_lock); + if (mdev->dump_data == NULL) { + mtx_unlock(&mdev->dump_lock); return (ENOENT); + } if (fwg->buf == NULL) { - fwg->reg_filled = dd->dump_size; + fwg->reg_filled = mdev->dump_size; + mtx_unlock(&mdev->dump_lock); return (0); } - if (atomic_load_acq_int(&dd->dump_valid) == 0) + if (!mdev->dump_valid) { + mtx_unlock(&mdev->dump_lock); return (ENOENT); + } + 
mdev->dump_copyout = true; + mtx_unlock(&mdev->dump_lock); urv = fwg->buf; - for (i = 0, r = dd->rege; r->cnt != 0; r++) { + for (i = 0, r = mdev->dump_rege; r->cnt != 0; r++) { for (ri = 0; ri < r->cnt; ri++) { if (i >= fwg->reg_cnt) goto out; rv.addr = r->addr + ri * 4; - rv.val = dd->dump[i]; + rv.val = mdev->dump_data[i]; error = copyout(&rv, urv, sizeof(rv)); if (error != 0) return (error); @@ -222,6 +228,10 @@ mlx5_fwdump_copyout(struct mlx5_dump_data *dd, struct } out: fwg->reg_filled = i; + mtx_lock(&mdev->dump_lock); + mdev->dump_copyout = false; + wakeup(&mdev->dump_copyout); + mtx_unlock(&mdev->dump_lock); return (0); } @@ -250,7 +260,6 @@ mlx5_ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t d struct mlx5_core_dev *mdev; struct mlx5_fwdump_get *fwg; struct mlx5_tool_addr *devaddr; - struct mlx5_dump_data *dd; struct mlx5_fw_update *fu; struct firmware fake_fw; int error; @@ -267,7 +276,7 @@ mlx5_ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t d error = mlx5_dbsf_to_core(devaddr, &mdev); if (error != 0) break; - error = mlx5_fwdump_copyout(mdev->dump_data, fwg); + error = mlx5_fwdump_copyout(mdev, fwg); break; case MLX5_FWDUMP_RESET: if ((fflag & FWRITE) == 0) { @@ -276,13 +285,8 @@ mlx5_ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t d } devaddr = (struct mlx5_tool_addr *)data; error = mlx5_dbsf_to_core(devaddr, &mdev); - if (error != 0) - break; - dd = mdev->dump_data; - if (dd != NULL) - atomic_store_rel_int(&dd->dump_valid, 0); - else - error = ENOENT; + if (error == 0) + error = mlx5_fwdump_reset(mdev); break; case MLX5_FWDUMP_FORCE: if ((fflag & FWRITE) == 0) { Modified: stable/11/sys/dev/mlx5/mlx5_core/mlx5_main.c ============================================================================== --- stable/11/sys/dev/mlx5/mlx5_core/mlx5_main.c Thu May 16 18:27:27 2019 (r347879) +++ stable/11/sys/dev/mlx5/mlx5_core/mlx5_main.c Thu May 16 18:28:12 2019 (r347880) @@ -1283,6 +1283,7 @@ static int init_one(struct pci_dev *pdev, 
spin_lock_init(&priv->ctx_lock); mutex_init(&dev->pci_status_mutex); mutex_init(&dev->intf_state_mutex); + mtx_init(&dev->dump_lock, "mlx5dmp", NULL, MTX_DEF | MTX_NEW); err = mlx5_pci_init(dev, priv); if (err) { device_printf(bsddev, "ERR: mlx5_pci_init failed %d\n", err); @@ -1317,6 +1318,7 @@ close_pci: mlx5_pci_close(dev, priv); clean_dev: sysctl_ctx_free(&dev->sysctl_ctx); + mtx_destroy(&dev->dump_lock); kfree(dev); return err; } @@ -1332,10 +1334,11 @@ static void remove_one(struct pci_dev *pdev) return; } - mlx5_fwdump_clean(dev); mlx5_pagealloc_cleanup(dev); mlx5_health_cleanup(dev); + mlx5_fwdump_clean(dev); mlx5_pci_close(dev, priv); + mtx_destroy(&dev->dump_lock); pci_set_drvdata(pdev, NULL); sysctl_ctx_free(&dev->sysctl_ctx); kfree(dev);
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201905161828.x4GISDGZ041083>