From owner-svn-src-stable@freebsd.org Mon Oct 29 21:46:16 2018 Return-Path: Delivered-To: svn-src-stable@mailman.ysv.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mailman.ysv.freebsd.org (Postfix) with ESMTP id E685110EB1D2; Mon, 29 Oct 2018 21:46:15 +0000 (UTC) (envelope-from davidcs@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client CN "mxrelay.nyi.freebsd.org", Issuer "Let's Encrypt Authority X3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 9675084ECF; Mon, 29 Oct 2018 21:46:15 +0000 (UTC) (envelope-from davidcs@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id AFAF6141AE; Mon, 29 Oct 2018 21:46:13 +0000 (UTC) (envelope-from davidcs@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id w9TLkD2l062658; Mon, 29 Oct 2018 21:46:13 GMT (envelope-from davidcs@FreeBSD.org) Received: (from davidcs@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id w9TLkDX4062655; Mon, 29 Oct 2018 21:46:13 GMT (envelope-from davidcs@FreeBSD.org) Message-Id: <201810292146.w9TLkDX4062655@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: davidcs set sender to davidcs@FreeBSD.org using -f From: David C Somayajulu Date: Mon, 29 Oct 2018 21:46:13 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-9@freebsd.org Subject: svn commit: r339887 - stable/9/sys/dev/bxe X-SVN-Group: stable-9 X-SVN-Commit-Author: davidcs X-SVN-Commit-Paths: stable/9/sys/dev/bxe X-SVN-Commit-Revision: 339887 X-SVN-Commit-Repository: base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-stable@freebsd.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: SVN commit messages for all the -stable branches of the src tree List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 29 Oct 2018 21:46:16 -0000 Author: davidcs Date: Mon Oct 29 21:46:12 2018 New Revision: 339887 URL: https://svnweb.freebsd.org/changeset/base/339887 Log: MFC r339366 Add support for Error Recovery Submitted by:Vaishali.Kulkarni@cavium.com Modified: stable/9/sys/dev/bxe/bxe.c stable/9/sys/dev/bxe/bxe.h stable/9/sys/dev/bxe/bxe_stats.c Directory Properties: stable/9/ (props changed) stable/9/sys/ (props changed) Modified: stable/9/sys/dev/bxe/bxe.c ============================================================================== --- stable/9/sys/dev/bxe/bxe.c Mon Oct 29 21:46:05 2018 (r339886) +++ stable/9/sys/dev/bxe/bxe.c Mon Oct 29 21:46:12 2018 (r339887) @@ -188,6 +188,7 @@ static int bxe_attach(device_t); static int bxe_detach(device_t); static int bxe_shutdown(device_t); + /* * FreeBSD KLD module/device interface event handler method. */ @@ -700,6 +701,9 @@ static void bxe_interrupt_detach(struct bxe_softc * static void bxe_set_rx_mode(struct bxe_softc *sc); static int bxe_init_locked(struct bxe_softc *sc); static int bxe_stop_locked(struct bxe_softc *sc); +static void bxe_sp_err_timeout_task(void *arg, int pending); +void bxe_parity_recover(struct bxe_softc *sc); +void bxe_handle_error(struct bxe_softc *sc); static __noinline int bxe_nic_load(struct bxe_softc *sc, int load_mode); static __noinline int bxe_nic_unload(struct bxe_softc *sc, @@ -3488,16 +3492,12 @@ bxe_watchdog(struct bxe_softc *sc, } BLOGE(sc, "TX watchdog timeout on fp[%02d], resetting!\n", fp->index); - if(sc->trigger_grcdump) { - /* taking grcdump */ - bxe_grc_dump(sc); - } BXE_FP_TX_UNLOCK(fp); + BXE_SET_ERROR_BIT(sc, BXE_ERR_TXQ_STUCK); + taskqueue_enqueue_timeout(taskqueue_thread, + &sc->sp_err_timeout_task, hz/10); - atomic_store_rel_long(&sc->chip_tq_flags, CHIP_TQ_REINIT); - taskqueue_enqueue(sc->chip_tq, &sc->chip_tq_task); - return (-1); } @@ -4252,6 +4252,7 @@ bxe_nic_unload(struct bxe_softc *sc, struct bxe_fastpath *fp; fp = &sc->fp[i]; + fp->watchdog_timer = 0; BXE_FP_TX_LOCK(fp); BXE_FP_TX_UNLOCK(fp); } @@ -4267,20 +4268,22 @@ bxe_nic_unload(struct bxe_softc *sc, if (IS_PF(sc) && sc->recovery_state != BXE_RECOVERY_DONE && (sc->state == BXE_STATE_CLOSED || sc->state == BXE_STATE_ERROR)) { - /* - * We can get here if the driver has been unloaded - * during parity error recovery and is either waiting for a - * leader to complete or for other functions to unload and - * then ifconfig down has been issued. In this case we want to - * unload and let other functions to complete a recovery - * process. - */ - sc->recovery_state = BXE_RECOVERY_DONE; - sc->is_leader = 0; - bxe_release_leader_lock(sc); - mb(); - BLOGD(sc, DBG_LOAD, "Releasing a leadership...\n"); + if(CHIP_PORT_MODE(sc) == CHIP_4_PORT_MODE) { + /* + * We can get here if the driver has been unloaded + * during parity error recovery and is either waiting for a + * leader to complete or for other functions to unload and + * then ifconfig down has been issued. In this case we want to + * unload and let other functions to complete a recovery + * process. + */ + sc->recovery_state = BXE_RECOVERY_DONE; + sc->is_leader = 0; + bxe_release_leader_lock(sc); + mb(); + BLOGD(sc, DBG_LOAD, "Releasing a leadership...\n"); + } BLOGE(sc, "Can't unload in closed or error state recover_state 0x%x" " state = 0x%x\n", sc->recovery_state, sc->state); return (-1); @@ -7575,6 +7578,10 @@ bxe_parity_attn(struct bxe_softc *sc, if (print) BLOGI(sc, "\n"); + if( *global == TRUE ) { + BXE_SET_ERROR_BIT(sc, BXE_ERR_GLOBAL); + } + return (TRUE); } @@ -7589,6 +7596,9 @@ bxe_chk_parity_attn(struct bxe_softc *sc, struct attn_route attn = { {0} }; int port = SC_PORT(sc); + if(sc->state != BXE_STATE_OPEN) + return FALSE; + attn.sig[0] = REG_RD(sc, MISC_REG_AEU_AFTER_INVERT_1_FUNC_0 + port*4); attn.sig[1] = REG_RD(sc, MISC_REG_AEU_AFTER_INVERT_2_FUNC_0 + port*4); attn.sig[2] = REG_RD(sc, MISC_REG_AEU_AFTER_INVERT_3_FUNC_0 + port*4); @@ -7615,10 +7625,12 @@ bxe_attn_int_deasserted4(struct bxe_softc *sc, uint32_t attn) { uint32_t val; + boolean_t err_flg = FALSE; if (attn & AEU_INPUTS_ATTN_BITS_PGLUE_HW_INTERRUPT) { val = REG_RD(sc, PGLUE_B_REG_PGLUE_B_INT_STS_CLR); BLOGE(sc, "PGLUE hw attention 0x%08x\n", val); + err_flg = TRUE; if (val & PGLUE_B_PGLUE_B_INT_STS_REG_ADDRESS_ERROR) BLOGE(sc, "PGLUE_B_PGLUE_B_INT_STS_REG_ADDRESS_ERROR\n"); if (val & PGLUE_B_PGLUE_B_INT_STS_REG_INCORRECT_RCV_BEHAVIOR) @@ -7642,6 +7654,7 @@ bxe_attn_int_deasserted4(struct bxe_softc *sc, if (attn & AEU_INPUTS_ATTN_BITS_ATC_HW_INTERRUPT) { val = REG_RD(sc, ATC_REG_ATC_INT_STS_CLR); BLOGE(sc, "ATC hw attention 0x%08x\n", val); + err_flg = TRUE; if (val & ATC_ATC_INT_STS_REG_ADDRESS_ERROR) BLOGE(sc, "ATC_ATC_INT_STS_REG_ADDRESS_ERROR\n"); if (val & ATC_ATC_INT_STS_REG_ATC_TCPL_TO_NOT_PEND) @@ -7661,7 +7674,14 @@ bxe_attn_int_deasserted4(struct bxe_softc *sc, BLOGE(sc, "FATAL parity attention set4 0x%08x\n", (uint32_t)(attn & (AEU_INPUTS_ATTN_BITS_PGLUE_PARITY_ERROR | AEU_INPUTS_ATTN_BITS_ATC_PARITY_ERROR))); + err_flg = TRUE; } + if (err_flg) { + BXE_SET_ERROR_BIT(sc, BXE_ERR_MISC); + taskqueue_enqueue_timeout(taskqueue_thread, + &sc->sp_err_timeout_task, hz/10); + } + } static void @@ -8016,14 +8036,21 @@ bxe_attn_int_deasserted3(struct bxe_softc *sc, REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_9, 0); REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_8, 0); REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_7, 0); - bxe_panic(sc, ("MC assert!\n")); - + bxe_int_disable(sc); + BXE_SET_ERROR_BIT(sc, BXE_ERR_MC_ASSERT); + taskqueue_enqueue_timeout(taskqueue_thread, + &sc->sp_err_timeout_task, hz/10); + } else if (attn & BXE_MCP_ASSERT) { BLOGE(sc, "MCP assert!\n"); REG_WR(sc, MISC_REG_AEU_GENERAL_ATTN_11, 0); - // XXX bxe_fw_dump(sc); + BXE_SET_ERROR_BIT(sc, BXE_ERR_MCP_ASSERT); + taskqueue_enqueue_timeout(taskqueue_thread, + &sc->sp_err_timeout_task, hz/10); + bxe_int_disable(sc); /*avoid repetive assert alert */ + } else { BLOGE(sc, "Unknown HW assert! (attn 0x%08x)\n", attn); } @@ -8051,6 +8078,7 @@ bxe_attn_int_deasserted2(struct bxe_softc *sc, int reg_offset; uint32_t val0, mask0, val1, mask1; uint32_t val; + boolean_t err_flg = FALSE; if (attn & AEU_INPUTS_ATTN_BITS_CFC_HW_INTERRUPT) { val = REG_RD(sc, CFC_REG_CFC_INT_STS_CLR); @@ -8058,6 +8086,7 @@ bxe_attn_int_deasserted2(struct bxe_softc *sc, /* CFC error attention */ if (val & 0x2) { BLOGE(sc, "FATAL error from CFC\n"); + err_flg = TRUE; } } @@ -8067,11 +8096,13 @@ bxe_attn_int_deasserted2(struct bxe_softc *sc, /* RQ_USDMDP_FIFO_OVERFLOW */ if (val & 0x18000) { BLOGE(sc, "FATAL error from PXP\n"); + err_flg = TRUE; } if (!CHIP_IS_E1x(sc)) { val = REG_RD(sc, PXP_REG_PXP_INT_STS_CLR_1); BLOGE(sc, "PXP hw attention-1 0x%08x\n", val); + err_flg = TRUE; } } @@ -8108,6 +8139,7 @@ bxe_attn_int_deasserted2(struct bxe_softc *sc, */ if (val0 & PXP2_EOP_ERROR_BIT) { BLOGE(sc, "PXP2_WR_PGLUE_EOP_ERROR\n"); + err_flg = TRUE; /* * if only PXP2_PXP2_INT_STS_0_REG_WR_PGLUE_EOP_ERROR is @@ -8130,8 +8162,15 @@ bxe_attn_int_deasserted2(struct bxe_softc *sc, BLOGE(sc, "FATAL HW block attention set2 0x%x\n", (uint32_t)(attn & HW_INTERRUT_ASSERT_SET_2)); + err_flg = TRUE; bxe_panic(sc, ("HW block attention set2\n")); } + if(err_flg) { + BXE_SET_ERROR_BIT(sc, BXE_ERR_GLOBAL); + taskqueue_enqueue_timeout(taskqueue_thread, + &sc->sp_err_timeout_task, hz/10); + } + } static void @@ -8141,6 +8180,7 @@ bxe_attn_int_deasserted1(struct bxe_softc *sc, int port = SC_PORT(sc); int reg_offset; uint32_t val; + boolean_t err_flg = FALSE; if (attn & AEU_INPUTS_ATTN_BITS_DOORBELLQ_HW_INTERRUPT) { val = REG_RD(sc, DORQ_REG_DORQ_INT_STS_CLR); @@ -8148,6 +8188,7 @@ bxe_attn_int_deasserted1(struct bxe_softc *sc, /* DORQ discard attention */ if (val & 0x2) { BLOGE(sc, "FATAL error from DORQ\n"); + err_flg = TRUE; } } @@ -8161,8 +8202,15 @@ bxe_attn_int_deasserted1(struct bxe_softc *sc, BLOGE(sc, "FATAL HW block attention set1 0x%08x\n", (uint32_t)(attn & HW_INTERRUT_ASSERT_SET_1)); + err_flg = TRUE; bxe_panic(sc, ("HW block attention set1\n")); } + if(err_flg) { + BXE_SET_ERROR_BIT(sc, BXE_ERR_MISC); + taskqueue_enqueue_timeout(taskqueue_thread, + &sc->sp_err_timeout_task, hz/10); + } + } static void @@ -8199,6 +8247,11 @@ bxe_attn_int_deasserted0(struct bxe_softc *sc, val &= ~(attn & HW_INTERRUT_ASSERT_SET_0); REG_WR(sc, reg_offset, val); + + BXE_SET_ERROR_BIT(sc, BXE_ERR_MISC); + taskqueue_enqueue_timeout(taskqueue_thread, + &sc->sp_err_timeout_task, hz/10); + bxe_panic(sc, ("FATAL HW block attention set0 0x%lx\n", (attn & HW_INTERRUT_ASSERT_SET_0))); } @@ -8228,10 +8281,12 @@ bxe_attn_int_deasserted(struct bxe_softc *sc, * In case of parity errors don't handle attentions so that * other function would "see" parity errors. */ - sc->recovery_state = BXE_RECOVERY_INIT; // XXX schedule a recovery task... /* disable HW interrupts */ bxe_int_disable(sc); + BXE_SET_ERROR_BIT(sc, BXE_ERR_PARITY); + taskqueue_enqueue_timeout(taskqueue_thread, + &sc->sp_err_timeout_task, hz/10); bxe_release_alr(sc); return; } @@ -12363,6 +12418,259 @@ bxe_periodic_stop(struct bxe_softc *sc) callout_drain(&sc->periodic_callout); } +void +bxe_parity_recover(struct bxe_softc *sc) +{ + uint8_t global = FALSE; + uint32_t error_recovered, error_unrecovered; + bool is_parity; + + + if ((sc->recovery_state == BXE_RECOVERY_FAILED) && + (sc->state == BXE_STATE_ERROR)) { + BLOGE(sc, "RECOVERY failed, " + "stack notified driver is NOT running! " + "Please reboot/power cycle the system.\n"); + return; + } + + while (1) { + BLOGD(sc, DBG_SP, + "%s sc=%p state=0x%x rec_state=0x%x error_status=%x\n", + __func__, sc, sc->state, sc->recovery_state, sc->error_status); + + switch(sc->recovery_state) { + + case BXE_RECOVERY_INIT: + is_parity = bxe_chk_parity_attn(sc, &global, FALSE); + + if ((CHIP_PORT_MODE(sc) == CHIP_4_PORT_MODE) || + (sc->error_status & BXE_ERR_MCP_ASSERT) || + (sc->error_status & BXE_ERR_GLOBAL)) { + + BXE_CORE_LOCK(sc); + if (sc->ifnet->if_drv_flags & IFF_DRV_RUNNING) { + bxe_periodic_stop(sc); + } + bxe_nic_unload(sc, UNLOAD_RECOVERY, false); + sc->state = BXE_STATE_ERROR; + sc->recovery_state = BXE_RECOVERY_FAILED; + BLOGE(sc, " No Recovery tried for error 0x%x" + " stack notified driver is NOT running!" + " Please reboot/power cycle the system.\n", + sc->error_status); + BXE_CORE_UNLOCK(sc); + return; + } + + + /* Try to get a LEADER_LOCK HW lock */ + if (bxe_trylock_leader_lock(sc)) { + + bxe_set_reset_in_progress(sc); + /* + * Check if there is a global attention and if + * there was a global attention, set the global + * reset bit. + */ + if (global) { + bxe_set_reset_global(sc); + } + sc->is_leader = 1; + } + + /* If interface has been removed - break */ + + if (sc->ifnet->if_drv_flags & IFF_DRV_RUNNING) { + bxe_periodic_stop(sc); + } + + BXE_CORE_LOCK(sc); + bxe_nic_unload(sc,UNLOAD_RECOVERY, false); + sc->recovery_state = BXE_RECOVERY_WAIT; + BXE_CORE_UNLOCK(sc); + + /* + * Ensure "is_leader", MCP command sequence and + * "recovery_state" update values are seen on other + * CPUs. + */ + mb(); + break; + case BXE_RECOVERY_WAIT: + + if (sc->is_leader) { + int other_engine = SC_PATH(sc) ? 0 : 1; + bool other_load_status = + bxe_get_load_status(sc, other_engine); + bool load_status = + bxe_get_load_status(sc, SC_PATH(sc)); + global = bxe_reset_is_global(sc); + + /* + * In case of a parity in a global block, let + * the first leader that performs a + * leader_reset() reset the global blocks in + * order to clear global attentions. Otherwise + * the gates will remain closed for that + * engine. + */ + if (load_status || + (global && other_load_status)) { + /* + * Wait until all other functions get + * down. + */ + taskqueue_enqueue_timeout(taskqueue_thread, + &sc->sp_err_timeout_task, hz/10); + return; + } else { + /* + * If all other functions got down + * try to bring the chip back to + * normal. In any case it's an exit + * point for a leader. + */ + if (bxe_leader_reset(sc)) { + BLOGE(sc, "RECOVERY failed, " + "stack notified driver is NOT running!\n"); + sc->recovery_state = BXE_RECOVERY_FAILED; + sc->state = BXE_STATE_ERROR; + mb(); + return; + } + + /* + * If we are here, means that the + * leader has succeeded and doesn't + * want to be a leader any more. Try + * to continue as a none-leader. + */ + break; + } + + } else { /* non-leader */ + if (!bxe_reset_is_done(sc, SC_PATH(sc))) { + /* + * Try to get a LEADER_LOCK HW lock as + * long as a former leader may have + * been unloaded by the user or + * released a leadership by another + * reason. + */ + if (bxe_trylock_leader_lock(sc)) { + /* + * I'm a leader now! Restart a + * switch case. + */ + sc->is_leader = 1; + break; + } + + taskqueue_enqueue_timeout(taskqueue_thread, + &sc->sp_err_timeout_task, hz/10); + return; + + } else { + /* + * If there was a global attention, wait + * for it to be cleared. + */ + if (bxe_reset_is_global(sc)) { + taskqueue_enqueue_timeout(taskqueue_thread, + &sc->sp_err_timeout_task, hz/10); + return; + } + + error_recovered = + sc->eth_stats.recoverable_error; + error_unrecovered = + sc->eth_stats.unrecoverable_error; + BXE_CORE_LOCK(sc); + sc->recovery_state = + BXE_RECOVERY_NIC_LOADING; + if (bxe_nic_load(sc, LOAD_NORMAL)) { + error_unrecovered++; + sc->recovery_state = BXE_RECOVERY_FAILED; + sc->state = BXE_STATE_ERROR; + BLOGE(sc, "Recovery is NOT successfull, " + " state=0x%x recovery_state=0x%x error=%x\n", + sc->state, sc->recovery_state, sc->error_status); + sc->error_status = 0; + } else { + sc->recovery_state = + BXE_RECOVERY_DONE; + error_recovered++; + BLOGI(sc, "Recovery is successfull from errors %x," + " state=0x%x" + " recovery_state=0x%x \n", sc->error_status, + sc->state, sc->recovery_state); + mb(); + } + sc->error_status = 0; + BXE_CORE_UNLOCK(sc); + sc->eth_stats.recoverable_error = + error_recovered; + sc->eth_stats.unrecoverable_error = + error_unrecovered; + + return; + } + } + default: + return; + } + } +} +void +bxe_handle_error(struct bxe_softc * sc) +{ + + if(sc->recovery_state == BXE_RECOVERY_WAIT) { + return; + } + if(sc->error_status) { + if (sc->state == BXE_STATE_OPEN) { + bxe_int_disable(sc); + } + if (sc->link_vars.link_up) { + if_link_state_change(sc->ifnet, LINK_STATE_DOWN); + } + sc->recovery_state = BXE_RECOVERY_INIT; + BLOGI(sc, "bxe%d: Recovery started errors 0x%x recovery state 0x%x\n", + sc->unit, sc->error_status, sc->recovery_state); + bxe_parity_recover(sc); + } +} + +static void +bxe_sp_err_timeout_task(void *arg, int pending) +{ + + struct bxe_softc *sc = (struct bxe_softc *)arg; + + BLOGD(sc, DBG_SP, + "%s state = 0x%x rec state=0x%x error_status=%x\n", + __func__, sc->state, sc->recovery_state, sc->error_status); + + if((sc->recovery_state == BXE_RECOVERY_FAILED) && + (sc->state == BXE_STATE_ERROR)) { + return; + } + /* if can be taken */ + if ((sc->error_status) && (sc->trigger_grcdump)) { + bxe_grc_dump(sc); + } + if (sc->recovery_state != BXE_RECOVERY_DONE) { + bxe_handle_error(sc); + bxe_parity_recover(sc); + } else if (sc->error_status) { + bxe_handle_error(sc); + } + + return; +} + /* start the controller */ static __noinline int bxe_nic_load(struct bxe_softc *sc, @@ -12644,6 +12952,15 @@ bxe_init_locked(struct bxe_softc *sc) return (0); } + if((sc->state == BXE_STATE_ERROR) && + (sc->recovery_state == BXE_RECOVERY_FAILED)) { + BLOGE(sc, "Initialization not done, " + "as previous recovery failed." + "Reboot/Power-cycle the system\n" ); + return (ENXIO); + } + + bxe_set_power_state(sc, PCI_PM_D0); /* @@ -16042,6 +16359,10 @@ bxe_attach(device_t dev) taskqueue_start_threads(&sc->chip_tq, 1, PWAIT, /* lower priority */ "%s", sc->chip_tq_name); + TIMEOUT_TASK_INIT(taskqueue_thread, + &sc->sp_err_timeout_task, 0, bxe_sp_err_timeout_task, sc); + + /* get device info and set params */ if (bxe_get_device_info(sc) != 0) { BLOGE(sc, "getting device info\n"); @@ -16217,6 +16538,8 @@ bxe_detach(device_t dev) taskqueue_drain(sc->chip_tq, &sc->chip_tq_task); taskqueue_free(sc->chip_tq); sc->chip_tq = NULL; + taskqueue_drain_timeout(taskqueue_thread, + &sc->sp_err_timeout_task); } /* stop and reset the controller if it was open */ Modified: stable/9/sys/dev/bxe/bxe.h ============================================================================== --- stable/9/sys/dev/bxe/bxe.h Mon Oct 29 21:46:05 2018 (r339886) +++ stable/9/sys/dev/bxe/bxe.h Mon Oct 29 21:46:12 2018 (r339887) @@ -474,6 +474,10 @@ struct bxe_device_type #define BXE_FW_RX_ALIGN_END (1 << BXE_RX_ALIGN_SHIFT) #define BXE_PXP_DRAM_ALIGN (BXE_RX_ALIGN_SHIFT - 5) /* XXX ??? */ +#define BXE_SET_ERROR_BIT(sc, error) \ +{ \ + (sc)->error_status |= (error); \ +} struct bxe_bar { struct resource *resource; @@ -1387,6 +1391,8 @@ struct bxe_softc { struct taskqueue *chip_tq; char chip_tq_name[32]; + struct timeout_task sp_err_timeout_task; + /* slowpath interrupt taskqueue */ struct task sp_tq_task; struct taskqueue *sp_tq; @@ -1540,6 +1546,16 @@ struct bxe_softc { #define BXE_RECOVERY_WAIT 3 #define BXE_RECOVERY_FAILED 4 #define BXE_RECOVERY_NIC_LOADING 5 + +#define BXE_ERR_TXQ_STUCK 0x1 /* Tx queue stuck detected by driver. */ +#define BXE_ERR_MISC 0x2 /* MISC ERR */ +#define BXE_ERR_PARITY 0x4 /* Parity error detected. */ +#define BXE_ERR_STATS_TO 0x8 /* Statistics timeout detected. */ +#define BXE_ERR_MC_ASSERT 0x10 /* MC assert attention received. */ +#define BXE_ERR_PANIC 0x20 /* Driver asserted. */ +#define BXE_ERR_MCP_ASSERT 0x40 /* MCP assert attention received. No Recovery*/ +#define BXE_ERR_GLOBAL 0x80 /* PCIe/PXP/IGU/MISC/NIG device blocks error- needs PCIe/Fundamental reset */ + uint32_t error_status; uint32_t rx_mode; #define BXE_RX_MODE_NONE 0 Modified: stable/9/sys/dev/bxe/bxe_stats.c ============================================================================== --- stable/9/sys/dev/bxe/bxe_stats.c Mon Oct 29 21:46:05 2018 (r339886) +++ stable/9/sys/dev/bxe/bxe_stats.c Mon Oct 29 21:46:12 2018 (r339887) @@ -36,7 +36,6 @@ __FBSDID("$FreeBSD$"); #define BITS_PER_LONG 64 #endif -extern int bxe_grc_dump(struct bxe_softc *sc); static inline long bxe_hilo(uint32_t *hiref) @@ -236,11 +235,11 @@ bxe_stats_comp(struct bxe_softc *sc) while (*stats_comp != DMAE_COMP_VAL) { if (!cnt) { BLOGE(sc, "Timeout waiting for stats finished\n"); - if(sc->trigger_grcdump) { - /* taking grcdump */ - bxe_grc_dump(sc); - } + BXE_SET_ERROR_BIT(sc, BXE_ERR_STATS_TO); + taskqueue_enqueue_timeout(taskqueue_thread, + &sc->sp_err_timeout_task, hz/10); break; + } cnt--; @@ -923,6 +922,7 @@ bxe_hw_stats_update(struct bxe_softc *sc) nig_timer_max = SHMEM_RD(sc, port_mb[SC_PORT(sc)].stat_nig_timer); if (nig_timer_max != estats->nig_timer_max) { estats->nig_timer_max = nig_timer_max; + /*NOTE: not setting error bit */ BLOGE(sc, "invalid NIG timer max (%u)\n", estats->nig_timer_max); } @@ -1316,12 +1316,10 @@ bxe_stats_update(struct bxe_softc *sc) if (bxe_storm_stats_update(sc)) { if (sc->stats_pending++ == 3) { if (sc->ifnet->if_drv_flags & IFF_DRV_RUNNING) { - if(sc->trigger_grcdump) { - /* taking grcdump */ - bxe_grc_dump(sc); - } - atomic_store_rel_long(&sc->chip_tq_flags, CHIP_TQ_REINIT); - taskqueue_enqueue(sc->chip_tq, &sc->chip_tq_task); + BLOGE(sc, "Storm stats not updated for 3 times, resetting\n"); + BXE_SET_ERROR_BIT(sc, BXE_ERR_STATS_TO); + taskqueue_enqueue_timeout(taskqueue_thread, + &sc->sp_err_timeout_task, hz/10); } } return;