Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 6 Mar 2018 23:12:32 +0000 (UTC)
From:      David C Somayajulu <davidcs@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org
Subject:   svn commit: r330555 - stable/11/sys/dev/qlxgbe
Message-ID:  <201803062312.w26NCW4n088637@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: davidcs
Date: Tue Mar  6 23:12:32 2018
New Revision: 330555
URL: https://svnweb.freebsd.org/changeset/base/330555

Log:
  MFC r329855
    1. Added support to offline a port if error recovery is unsuccessful.
    2. Sysctls to enable/disable driver_state_dump and error_recovery.
    3. Sysctl to control the delay between hw/fw reinitialization and
       restarting the fastpath.
    4. Stop periodic stats retrieval if interface has IFF_DRV_RUNNING flag off.
    5. Print contents of PEG_HALT_STATUS1 and PEG_HALT_STATUS2 on heartbeat
       failure.
    6. Speed up slowpath shutdown during error recovery.
    7. link_state update using atomic_store.
    8. Added timestamp information on driver state and minidump captures.
    9. Added support for Slowpath event logging.
    10. Added additional failure injection types to simulate failures.

Modified:
  stable/11/sys/dev/qlxgbe/ql_dbg.h
  stable/11/sys/dev/qlxgbe/ql_def.h
  stable/11/sys/dev/qlxgbe/ql_glbl.h
  stable/11/sys/dev/qlxgbe/ql_hw.c
  stable/11/sys/dev/qlxgbe/ql_hw.h
  stable/11/sys/dev/qlxgbe/ql_inline.h
  stable/11/sys/dev/qlxgbe/ql_ioctl.c
  stable/11/sys/dev/qlxgbe/ql_ioctl.h
  stable/11/sys/dev/qlxgbe/ql_isr.c
  stable/11/sys/dev/qlxgbe/ql_misc.c
  stable/11/sys/dev/qlxgbe/ql_os.c
  stable/11/sys/dev/qlxgbe/ql_os.h
  stable/11/sys/dev/qlxgbe/ql_ver.h
Directory Properties:
  stable/11/   (props changed)

Modified: stable/11/sys/dev/qlxgbe/ql_dbg.h
==============================================================================
--- stable/11/sys/dev/qlxgbe/ql_dbg.h	Tue Mar  6 22:45:45 2018	(r330554)
+++ stable/11/sys/dev/qlxgbe/ql_dbg.h	Tue Mar  6 23:12:32 2018	(r330555)
@@ -42,17 +42,21 @@ extern void ql_dump_buf16(qla_host_t *ha, const char *
 extern void ql_dump_buf32(qla_host_t *ha, const char *str, void *dbuf,
 		uint32_t len32);
 
-#define INJCT_RX_RXB_INVAL		0x00001
-#define INJCT_RX_MP_NULL		0x00002
-#define INJCT_LRO_RXB_INVAL		0x00003
-#define INJCT_LRO_MP_NULL		0x00004
-#define INJCT_NUM_HNDLE_INVALID		0x00005
-#define INJCT_RDWR_INDREG_FAILURE	0x00006
-#define INJCT_RDWR_OFFCHIPMEM_FAILURE	0x00007
-#define INJCT_MBX_CMD_FAILURE		0x00008
-#define INJCT_HEARTBEAT_FAILURE		0x00009
-#define INJCT_TEMPERATURE_FAILURE	0x0000A
-#define INJCT_M_GETCL_M_GETJCL_FAILURE	0x0000B
+#define INJCT_RX_RXB_INVAL				0x00001
+#define INJCT_RX_MP_NULL				0x00002
+#define INJCT_LRO_RXB_INVAL				0x00003
+#define INJCT_LRO_MP_NULL				0x00004
+#define INJCT_NUM_HNDLE_INVALID				0x00005
+#define INJCT_RDWR_INDREG_FAILURE			0x00006
+#define INJCT_RDWR_OFFCHIPMEM_FAILURE			0x00007
+#define INJCT_MBX_CMD_FAILURE				0x00008
+#define INJCT_HEARTBEAT_FAILURE				0x00009
+#define INJCT_TEMPERATURE_FAILURE			0x0000A
+#define INJCT_M_GETCL_M_GETJCL_FAILURE			0x0000B
+#define INJCT_INV_CONT_OPCODE				0x0000C
+#define INJCT_SGL_RCV_INV_DESC_COUNT			0x0000D
+#define INJCT_SGL_LRO_INV_DESC_COUNT			0x0000E
+#define INJCT_PEER_PORT_FAILURE_ERR_RECOVERY		0x0000F
 
 #ifdef QL_DBG
 

Modified: stable/11/sys/dev/qlxgbe/ql_def.h
==============================================================================
--- stable/11/sys/dev/qlxgbe/ql_def.h	Tue Mar  6 22:45:45 2018	(r330554)
+++ stable/11/sys/dev/qlxgbe/ql_def.h	Tue Mar  6 23:12:32 2018	(r330555)
@@ -144,12 +144,12 @@ struct qla_host {
 	volatile uint32_t	qla_watchdog_paused;
 	volatile uint32_t	qla_initiate_recovery;
 	volatile uint32_t	qla_detach_active;
+	volatile uint32_t	offline;
 
 	device_t		pci_dev;
 
-	uint16_t		watchdog_ticks;
+	volatile uint16_t	watchdog_ticks;
 	uint8_t			pci_func;
-	uint8_t			resvd;
 
         /* ioctl related */
         struct cdev             *ioctl_dev;
@@ -182,6 +182,7 @@ struct qla_host {
 
 	/* hardware access lock */
 
+	struct mtx		sp_log_lock;
 	struct mtx		hw_lock;
 	volatile uint32_t	hw_lock_held;
 	uint64_t		hw_lock_failed;
@@ -239,6 +240,9 @@ struct qla_host {
 	volatile const char	*qla_unlock;
 	uint32_t		dbg_level;
 	uint32_t		enable_minidump;
+	uint32_t		enable_driverstate_dump;
+	uint32_t		enable_error_recovery;
+	uint32_t		ms_delay_after_init;
 
 	uint8_t			fw_ver_str[32];
 
@@ -272,5 +276,7 @@ typedef struct qla_host qla_host_t;
 #define QL_MAC_CMP(mac1, mac2)    \
 	((((*(uint32_t *) mac1) == (*(uint32_t *) mac2) && \
 	(*(uint16_t *)(mac1 + 4)) == (*(uint16_t *)(mac2 + 4)))) ? 0 : 1)
+
+#define QL_INITIATE_RECOVERY(ha) qla_set_error_recovery(ha)
 
 #endif /* #ifndef _QL_DEF_H_ */

Modified: stable/11/sys/dev/qlxgbe/ql_glbl.h
==============================================================================
--- stable/11/sys/dev/qlxgbe/ql_glbl.h	Tue Mar  6 22:45:45 2018	(r330554)
+++ stable/11/sys/dev/qlxgbe/ql_glbl.h	Tue Mar  6 23:12:32 2018	(r330555)
@@ -47,6 +47,7 @@ extern uint32_t ql_rcv_isr(qla_host_t *ha, uint32_t sd
 extern int ql_alloc_dmabuf(qla_host_t *ha, qla_dma_t *dma_buf);
 extern void ql_free_dmabuf(qla_host_t *ha, qla_dma_t *dma_buf);
 extern int ql_get_mbuf(qla_host_t *ha, qla_rx_buf_t *rxb, struct mbuf *nmp);
+extern void qla_set_error_recovery(qla_host_t *ha);
 
 /*
  * from ql_hw.c
@@ -115,5 +116,11 @@ extern unsigned int ql83xx_minidump_len;
 extern void ql_alloc_drvr_state_buffer(qla_host_t *ha);
 extern void ql_free_drvr_state_buffer(qla_host_t *ha);
 extern void ql_capture_drvr_state(qla_host_t *ha);
+extern void ql_sp_log(qla_host_t *ha, uint16_t fmtstr_idx, uint16_t num_params,
+		uint32_t param0, uint32_t param1, uint32_t param2,
+		uint32_t param3, uint32_t param4);
+extern void ql_alloc_sp_log_buffer(qla_host_t *ha);
+extern void ql_free_sp_log_buffer(qla_host_t *ha);
+
 
 #endif /* #ifndef_QL_GLBL_H_ */

Modified: stable/11/sys/dev/qlxgbe/ql_hw.c
==============================================================================
--- stable/11/sys/dev/qlxgbe/ql_hw.c	Tue Mar  6 22:45:45 2018	(r330554)
+++ stable/11/sys/dev/qlxgbe/ql_hw.c	Tue Mar  6 23:12:32 2018	(r330555)
@@ -49,7 +49,7 @@ __FBSDID("$FreeBSD$");
 
 static void qla_del_rcv_cntxt(qla_host_t *ha);
 static int qla_init_rcv_cntxt(qla_host_t *ha);
-static void qla_del_xmt_cntxt(qla_host_t *ha);
+static int qla_del_xmt_cntxt(qla_host_t *ha);
 static int qla_init_xmt_cntxt(qla_host_t *ha);
 static int qla_mbx_cmd(qla_host_t *ha, uint32_t *h_mbox, uint32_t n_hmbox,
 	uint32_t *fw_mbox, uint32_t n_fwmbox, uint32_t no_pause);
@@ -647,11 +647,118 @@ qlnx_add_hw_xmt_stats_sysctls(qla_host_t *ha)
 }
 
 static void
+qlnx_add_hw_mbx_cmpl_stats_sysctls(qla_host_t *ha)
+{
+        struct sysctl_ctx_list  *ctx;
+        struct sysctl_oid_list  *node_children;
+
+        ctx = device_get_sysctl_ctx(ha->pci_dev);
+        node_children = SYSCTL_CHILDREN(device_get_sysctl_tree(ha->pci_dev));
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_lt_200ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[0],
+		"mbx_completion_time_lt_200ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_200ms_400ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[1],
+		"mbx_completion_time_200ms_400ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_400ms_600ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[2],
+		"mbx_completion_time_400ms_600ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_600ms_800ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[3],
+		"mbx_completion_time_600ms_800ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_800ms_1000ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[4],
+		"mbx_completion_time_800ms_1000ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_1000ms_1200ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[5],
+		"mbx_completion_time_1000ms_1200ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_1200ms_1400ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[6],
+		"mbx_completion_time_1200ms_1400ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_1400ms_1600ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[7],
+		"mbx_completion_time_1400ms_1600ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_1600ms_1800ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[8],
+		"mbx_completion_time_1600ms_1800ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_1800ms_2000ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[9],
+		"mbx_completion_time_1800ms_2000ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_2000ms_2200ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[10],
+		"mbx_completion_time_2000ms_2200ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_2200ms_2400ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[11],
+		"mbx_completion_time_2200ms_2400ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_2400ms_2600ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[12],
+		"mbx_completion_time_2400ms_2600ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_2600ms_2800ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[13],
+		"mbx_completion_time_2600ms_2800ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_2800ms_3000ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[14],
+		"mbx_completion_time_2800ms_3000ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_3000ms_4000ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[15],
+		"mbx_completion_time_3000ms_4000ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_time_4000ms_5000ms",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[16],
+		"mbx_completion_time_4000ms_5000ms");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_host_mbx_cntrl_timeout",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[17],
+		"mbx_completion_host_mbx_cntrl_timeout");
+
+	SYSCTL_ADD_QUAD(ctx, node_children,
+		OID_AUTO, "mbx_completion_fw_mbx_cntrl_timeout",
+		CTLFLAG_RD, &ha->hw.mbx_comp_msecs[18],
+		"mbx_completion_fw_mbx_cntrl_timeout");
+	return;
+}
+
+static void
 qlnx_add_hw_stats_sysctls(qla_host_t *ha)
 {
 	qlnx_add_hw_mac_stats_sysctls(ha);
 	qlnx_add_hw_rcv_stats_sysctls(ha);
 	qlnx_add_hw_xmt_stats_sysctls(ha);
+	qlnx_add_hw_mbx_cmpl_stats_sysctls(ha);
 
 	return;
 }
@@ -918,6 +1025,30 @@ ql_hw_add_sysctls(qla_host_t *ha)
 		"\t Any change requires ifconfig down/up to take effect\n"
 		"\t Note that LRO may be turned off/on via ifconfig\n");
 
+        SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev),
+                SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
+                OID_AUTO, "sp_log_index", CTLFLAG_RW, &ha->hw.sp_log_index,
+                ha->hw.sp_log_index, "sp_log_index");
+
+        SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev),
+                SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
+                OID_AUTO, "sp_log_stop", CTLFLAG_RW, &ha->hw.sp_log_stop,
+                ha->hw.sp_log_stop, "sp_log_stop");
+
+        ha->hw.sp_log_stop_events = 0;
+
+        SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev),
+                SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
+                OID_AUTO, "sp_log_stop_events", CTLFLAG_RW,
+		&ha->hw.sp_log_stop_events,
+                ha->hw.sp_log_stop_events, "Slow path event log is stopped"
+		" when OR of the following events occur \n"
+		"\t 0x01 : Heart beat Failure\n"
+		"\t 0x02 : Temperature Failure\n"
+		"\t 0x04 : HW Initialization Failure\n"
+		"\t 0x08 : Interface Initialization Failure\n"
+		"\t 0x10 : Error Recovery Failure\n");
+
 	ha->hw.mdump_active = 0;
         SYSCTL_ADD_UINT(device_get_sysctl_ctx(dev),
                 SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
@@ -998,13 +1129,13 @@ ql_hw_link_status(qla_host_t *ha)
 		device_printf(ha->pci_dev, "link Down\n");
 	}
 
-	if (ha->hw.flags.fduplex) {
+	if (ha->hw.fduplex) {
 		device_printf(ha->pci_dev, "Full Duplex\n");
 	} else {
 		device_printf(ha->pci_dev, "Half Duplex\n");
 	}
 
-	if (ha->hw.flags.autoneg) {
+	if (ha->hw.autoneg) {
 		device_printf(ha->pci_dev, "Auto Negotiation Enabled\n");
 	} else {
 		device_printf(ha->pci_dev, "Auto Negotiation Disabled\n");
@@ -1255,19 +1386,39 @@ qla_mbx_cmd(qla_host_t *ha, uint32_t *h_mbox, uint32_t
 	uint32_t i;
 	uint32_t data;
 	int ret = 0;
+	uint64_t start_usecs;
+	uint64_t end_usecs;
+	uint64_t msecs_200;
 
-	if (QL_ERR_INJECT(ha, INJCT_MBX_CMD_FAILURE)) {
+	ql_sp_log(ha, 0, 5, no_pause, h_mbox[0], h_mbox[1], h_mbox[2], h_mbox[3]);
+
+	if (ha->offline || ha->qla_initiate_recovery) {
+		ql_sp_log(ha, 1, 2, ha->offline, ha->qla_initiate_recovery, 0, 0, 0);
+		goto exit_qla_mbx_cmd;
+	}
+
+	if (((ha->err_inject & 0xFFFF) == INJCT_MBX_CMD_FAILURE) &&
+		(((ha->err_inject & ~0xFFFF) == ((h_mbox[0] & 0xFFFF) << 16))||
+		!(ha->err_inject & ~0xFFFF))) {
 		ret = -3;
-		ha->qla_initiate_recovery = 1;
+		QL_INITIATE_RECOVERY(ha);
 		goto exit_qla_mbx_cmd;
 	}
 
+	start_usecs = qla_get_usec_timestamp();
+
 	if (no_pause)
 		i = 1000;
 	else
 		i = Q8_MBX_MSEC_DELAY;
 
 	while (i) {
+
+		if (ha->qla_initiate_recovery) {
+			ql_sp_log(ha, 2, 1, ha->qla_initiate_recovery, 0, 0, 0, 0);
+			return (-1);
+		}
+
 		data = READ_REG32(ha, Q8_HOST_MBOX_CNTRL);
 		if (data == 0)
 			break;
@@ -1282,8 +1433,10 @@ qla_mbx_cmd(qla_host_t *ha, uint32_t *h_mbox, uint32_t
 	if (i == 0) {
 		device_printf(ha->pci_dev, "%s: host_mbx_cntrl 0x%08x\n",
 			__func__, data);
+		ql_sp_log(ha, 3, 1, data, 0, 0, 0, 0);
 		ret = -1;
-		ha->qla_initiate_recovery = 1;
+		ha->hw.mbx_comp_msecs[(Q8_MBX_COMP_MSECS - 2)]++;
+		QL_INITIATE_RECOVERY(ha);
 		goto exit_qla_mbx_cmd;
 	}
 
@@ -1297,6 +1450,12 @@ qla_mbx_cmd(qla_host_t *ha, uint32_t *h_mbox, uint32_t
 
 	i = Q8_MBX_MSEC_DELAY;
 	while (i) {
+
+		if (ha->qla_initiate_recovery) {
+			ql_sp_log(ha, 4, 1, ha->qla_initiate_recovery, 0, 0, 0, 0);
+			return (-1);
+		}
+
 		data = READ_REG32(ha, Q8_FW_MBOX_CNTRL);
 
 		if ((data & 0x3) == 1) {
@@ -1314,18 +1473,44 @@ qla_mbx_cmd(qla_host_t *ha, uint32_t *h_mbox, uint32_t
 	if (i == 0) {
 		device_printf(ha->pci_dev, "%s: fw_mbx_cntrl 0x%08x\n",
 			__func__, data);
+		ql_sp_log(ha, 5, 1, data, 0, 0, 0, 0);
 		ret = -2;
-		ha->qla_initiate_recovery = 1;
+		ha->hw.mbx_comp_msecs[(Q8_MBX_COMP_MSECS - 1)]++;
+		QL_INITIATE_RECOVERY(ha);
 		goto exit_qla_mbx_cmd;
 	}
 
 	for (i = 0; i < n_fwmbox; i++) {
+
+		if (ha->qla_initiate_recovery) {
+			ql_sp_log(ha, 6, 1, ha->qla_initiate_recovery, 0, 0, 0, 0);
+			return (-1);
+		}
+
 		*fw_mbox++ = READ_REG32(ha, (Q8_FW_MBOX0 + (i << 2)));
 	}
 
 	WRITE_REG32(ha, Q8_FW_MBOX_CNTRL, 0x0);
 	WRITE_REG32(ha, ha->hw.mbx_intr_mask_offset, 0x0);
 
+	end_usecs = qla_get_usec_timestamp();
+
+	if (end_usecs > start_usecs) {
+		msecs_200 = (end_usecs - start_usecs)/(1000 * 200);
+
+		if (msecs_200 < 15) 
+			ha->hw.mbx_comp_msecs[msecs_200]++;
+		else if (msecs_200 < 20)
+			ha->hw.mbx_comp_msecs[15]++;
+		else {
+			device_printf(ha->pci_dev, "%s: [%ld, %ld] %ld\n", __func__,
+				start_usecs, end_usecs, msecs_200);
+			ha->hw.mbx_comp_msecs[16]++;
+		}
+	}
+	ql_sp_log(ha, 7, 5, fw_mbox[0], fw_mbox[1], fw_mbox[2], fw_mbox[3], fw_mbox[4]);
+
+
 exit_qla_mbx_cmd:
 	return (ret);
 }
@@ -1401,7 +1586,8 @@ qla_config_intr_cntxt(qla_host_t *ha, uint32_t start_i
 	if (qla_mbx_cmd(ha, (uint32_t *)c_intr,
 		(sizeof (q80_config_intr_t) >> 2),
 		ha->hw.mbox, (sizeof (q80_config_intr_rsp_t) >> 2), 0)) {
-		device_printf(dev, "%s: failed0\n", __func__);
+		device_printf(dev, "%s: %s failed0\n", __func__,
+			(create ? "create" : "delete"));
 		return (-1);
 	}
 
@@ -1410,8 +1596,8 @@ qla_config_intr_cntxt(qla_host_t *ha, uint32_t start_i
 	err = Q8_MBX_RSP_STATUS(c_intr_rsp->regcnt_status);
 
 	if (err) {
-		device_printf(dev, "%s: failed1 [0x%08x, %d]\n", __func__, err,
-			c_intr_rsp->nentries);
+		device_printf(dev, "%s: %s failed1 [0x%08x, %d]\n", __func__,
+			(create ? "create" : "delete"), err, c_intr_rsp->nentries);
 
 		for (i = 0; i < c_intr_rsp->nentries; i++) {
 			device_printf(dev, "%s: [%d]:[0x%x 0x%x 0x%x]\n",
@@ -2015,7 +2201,8 @@ ql_get_stats(qla_host_t *ha)
 
 	cmd |= ((ha->pci_func & 0x1) << 16);
 
-	if (ha->qla_watchdog_pause)
+	if (ha->qla_watchdog_pause || (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) ||
+		ha->offline)
 		goto ql_get_stats_exit;
 
 	if (qla_get_hw_stats(ha, cmd, sizeof (q80_get_stats_rsp_t)) == 0) {
@@ -2032,7 +2219,8 @@ ql_get_stats(qla_host_t *ha)
 //	cmd |= Q8_GET_STATS_CMD_CLEAR;
 	cmd |= (ha->hw.rcv_cntxt_id << 16);
 
-	if (ha->qla_watchdog_pause)
+	if (ha->qla_watchdog_pause || (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) ||
+		ha->offline)
 		goto ql_get_stats_exit;
 
 	if (qla_get_hw_stats(ha, cmd, sizeof (q80_get_stats_rsp_t)) == 0) {
@@ -2043,13 +2231,18 @@ ql_get_stats(qla_host_t *ha)
 			__func__, ha->hw.mbox[0]);
 	}
 
-	if (ha->qla_watchdog_pause)
+	if (ha->qla_watchdog_pause || (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) ||
+		ha->offline)
 		goto ql_get_stats_exit;
 	/*
 	 * Get XMT Statistics
 	 */
-	for (i = 0 ; ((i < ha->hw.num_tx_rings) && (!ha->qla_watchdog_pause));
-		i++) {
+	for (i = 0 ; (i < ha->hw.num_tx_rings); i++) {
+		if (ha->qla_watchdog_pause ||
+			(!(ifp->if_drv_flags & IFF_DRV_RUNNING)) ||
+			ha->offline)
+			goto ql_get_stats_exit;
+
 		cmd = Q8_GET_STATS_CMD_XMT | Q8_GET_STATS_CMD_TYPE_CNTXT;
 //		cmd |= Q8_GET_STATS_CMD_CLEAR;
 		cmd |= (ha->hw.tx_cntxt[i].tx_cntxt_id << 16);
@@ -2679,7 +2872,8 @@ ql_del_hw_if(qla_host_t *ha)
 
 	qla_del_rcv_cntxt(ha);
 
-	qla_del_xmt_cntxt(ha);
+	if(qla_del_xmt_cntxt(ha))
+		goto ql_del_hw_if_exit;
 
 	if (ha->hw.flags.init_intr_cnxt) {
 		for (i = 0; i < ha->hw.num_sds_rings; ) {
@@ -2688,14 +2882,17 @@ ql_del_hw_if(qla_host_t *ha)
 				num_msix = Q8_MAX_INTR_VECTORS;
 			else
 				num_msix = ha->hw.num_sds_rings - i;
-			qla_config_intr_cntxt(ha, i, num_msix, 0);
 
+			if (qla_config_intr_cntxt(ha, i, num_msix, 0))
+				break;
+
 			i += num_msix;
 		}
 
 		ha->hw.flags.init_intr_cnxt = 0;
 	}
 
+ql_del_hw_if_exit:
 	if (ha->hw.enable_soft_lro) {
 		qla_drain_soft_lro(ha);
 		qla_free_soft_lro(ha);
@@ -3328,19 +3525,22 @@ qla_del_xmt_cntxt_i(qla_host_t *ha, uint32_t txr_idx)
 
 	return (0);
 }
-static void
+static int
 qla_del_xmt_cntxt(qla_host_t *ha)
 {
 	uint32_t i;
+	int ret = 0;
 
 	if (!ha->hw.flags.init_tx_cnxt)
-		return;
+		return (ret);
 
 	for (i = 0; i < ha->hw.num_tx_rings; i++) {
-		if (qla_del_xmt_cntxt_i(ha, i))
+		if ((ret = qla_del_xmt_cntxt_i(ha, i)) != 0)
 			break;
 	}
 	ha->hw.flags.init_tx_cnxt = 0;
+
+	return (ret);
 }
 
 static int
@@ -3350,8 +3550,10 @@ qla_init_xmt_cntxt(qla_host_t *ha)
 
 	for (i = 0; i < ha->hw.num_tx_rings; i++) {
 		if (qla_init_xmt_cntxt_i(ha, i) != 0) {
-			for (j = 0; j < i; j++)
-				qla_del_xmt_cntxt_i(ha, j);
+			for (j = 0; j < i; j++) {
+				if (qla_del_xmt_cntxt_i(ha, j))
+					break;
+			}
 			return (-1);
 		}
 	}
@@ -3627,22 +3829,23 @@ ql_hw_tx_done_locked(qla_host_t *ha, uint32_t txr_idx)
 void
 ql_update_link_state(qla_host_t *ha)
 {
-	uint32_t link_state;
+	uint32_t link_state = 0;
 	uint32_t prev_link_state;
 
-	if (!(ha->ifp->if_drv_flags & IFF_DRV_RUNNING)) {
-		ha->hw.link_up = 0;
-		return;
-	}
-	link_state = READ_REG32(ha, Q8_LINK_STATE);
-
 	prev_link_state =  ha->hw.link_up;
 
-	if (ha->pci_func == 0) 
-		ha->hw.link_up = (((link_state & 0xF) == 1)? 1 : 0);
-	else
-		ha->hw.link_up = ((((link_state >> 4)& 0xF) == 1)? 1 : 0);
+	if (ha->ifp->if_drv_flags & IFF_DRV_RUNNING) {
+		link_state = READ_REG32(ha, Q8_LINK_STATE);
 
+		if (ha->pci_func == 0) {
+			link_state = (((link_state & 0xF) == 1)? 1 : 0);
+		} else {
+			link_state = ((((link_state >> 4)& 0xF) == 1)? 1 : 0);
+		}
+	}
+
+	atomic_store_rel_8(&ha->hw.link_up, (uint8_t)link_state);
+
 	if (prev_link_state !=  ha->hw.link_up) {
 		if (ha->hw.link_up) {
 			if_link_state_change(ha->ifp, LINK_STATE_UP);
@@ -3669,8 +3872,14 @@ ql_hw_check_health(qla_host_t *ha)
 
 	if (((val & 0xFFFF) == 2) || ((val & 0xFFFF) == 3) ||
 		(QL_ERR_INJECT(ha, INJCT_TEMPERATURE_FAILURE))) {
-		device_printf(ha->pci_dev, "%s: Temperature Alert [0x%08x]\n",
-			__func__, val);
+		device_printf(ha->pci_dev, "%s: Temperature Alert"
+			" at ts_usecs %ld ts_reg = 0x%08x\n",
+			__func__, qla_get_usec_timestamp(), val);
+
+		if (ha->hw.sp_log_stop_events & Q8_SP_LOG_STOP_TEMP_FAILURE)
+			ha->hw.sp_log_stop = -1;
+
+		QL_INITIATE_RECOVERY(ha);
 		return -1;
 	}
 
@@ -3691,10 +3900,26 @@ ql_hw_check_health(qla_host_t *ha)
 			__func__, val);
 	if (ha->hw.hbeat_failure < 2) /* we ignore the first failure */
 		return 0;
-	else 
-		device_printf(ha->pci_dev, "%s: Heartbeat Failue [0x%08x]\n",
-			__func__, val);
+	else {
+		uint32_t peg_halt_status1;
+		uint32_t peg_halt_status2;
 
+		peg_halt_status1 = READ_REG32(ha, Q8_PEG_HALT_STATUS1);
+		peg_halt_status2 = READ_REG32(ha, Q8_PEG_HALT_STATUS2);
+
+		device_printf(ha->pci_dev,
+			"%s: Heartbeat Failue at ts_usecs = %ld "
+			"fw_heart_beat = 0x%08x "
+			"peg_halt_status1 = 0x%08x "
+			"peg_halt_status2 = 0x%08x\n",
+			__func__, qla_get_usec_timestamp(), val,
+			peg_halt_status1, peg_halt_status2);
+
+		if (ha->hw.sp_log_stop_events & Q8_SP_LOG_STOP_HBEAT_FAILURE)
+			ha->hw.sp_log_stop = -1;
+	}
+	QL_INITIATE_RECOVERY(ha);
+
 	return -1;
 }
 
@@ -4429,8 +4654,8 @@ ql_minidump(qla_host_t *ha)
 
 	if (ha->hw.mdump_done)
 		return;
-
-		ha->hw.mdump_start_seq_index = ql_stop_sequence(ha);
+	ha->hw.mdump_usec_ts = qla_get_usec_timestamp();
+	ha->hw.mdump_start_seq_index = ql_stop_sequence(ha);
 
 	bzero(ha->hw.mdump_buffer, ha->hw.mdump_buffer_size);
 	bzero(ha->hw.mdump_template, ha->hw.mdump_template_size);

Modified: stable/11/sys/dev/qlxgbe/ql_hw.h
==============================================================================
--- stable/11/sys/dev/qlxgbe/ql_hw.h	Tue Mar  6 22:45:45 2018	(r330554)
+++ stable/11/sys/dev/qlxgbe/ql_hw.h	Tue Mar  6 23:12:32 2018	(r330555)
@@ -1600,26 +1600,26 @@ typedef struct _qla_hw {
 		uint32_t
 			unicast_mac	:1,
 			bcast_mac	:1,
-			loopback_mode	:2,
 			init_tx_cnxt	:1,
 			init_rx_cnxt	:1,
 			init_intr_cnxt	:1,
-			fduplex		:1,
-			autoneg		:1,
 			fdt_valid	:1;
 	} flags;
 
 
-	uint16_t	link_speed;
-	uint16_t	cable_length;
-	uint32_t	cable_oui;
-	uint8_t		link_up;
-	uint8_t		module_type;
-	uint8_t		link_faults;
+	volatile uint16_t	link_speed;
+	volatile uint16_t	cable_length;
+	volatile uint32_t	cable_oui;
+	volatile uint8_t	link_up;
+	volatile uint8_t	module_type;
+	volatile uint8_t	link_faults;
+	volatile uint8_t	loopback_mode;
+	volatile uint8_t	fduplex;
+	volatile uint8_t	autoneg;
 
-	uint8_t		mac_rcv_mode;
+	volatile uint8_t	mac_rcv_mode;
 
-	uint32_t	max_mtu;
+	volatile uint32_t	max_mtu;
 
 	uint8_t		mac_addr[ETHER_ADDR_LEN];
 
@@ -1703,9 +1703,25 @@ typedef struct _qla_hw {
 	uint32_t	mdump_buffer_size;
 	void		*mdump_template;
 	uint32_t	mdump_template_size;
+	uint64_t	mdump_usec_ts;
 
+#define Q8_MBX_COMP_MSECS	(19)
+	uint64_t	mbx_comp_msecs[Q8_MBX_COMP_MSECS];
 	/* driver state related */
 	void		*drvr_state;
+
+	/* slow path trace */
+	uint32_t	sp_log_stop_events;
+#define Q8_SP_LOG_STOP_HBEAT_FAILURE		0x001
+#define Q8_SP_LOG_STOP_TEMP_FAILURE		0x002
+#define Q8_SP_LOG_STOP_HW_INIT_FAILURE		0x004
+#define Q8_SP_LOG_STOP_IF_START_FAILURE		0x008
+#define Q8_SP_LOG_STOP_ERR_RECOVERY_FAILURE	0x010
+
+	uint32_t	sp_log_stop;
+	uint32_t	sp_log_index;
+	uint32_t	sp_log_num_entries;
+	void		*sp_log;
 } qla_hw_t;
 
 #define QL_UPDATE_RDS_PRODUCER_INDEX(ha, prod_reg, val) \

Modified: stable/11/sys/dev/qlxgbe/ql_inline.h
==============================================================================
--- stable/11/sys/dev/qlxgbe/ql_inline.h	Tue Mar  6 22:45:45 2018	(r330554)
+++ stable/11/sys/dev/qlxgbe/ql_inline.h	Tue Mar  6 23:12:32 2018	(r330555)
@@ -166,7 +166,7 @@ qla_lock(qla_host_t *ha, const char *str, uint32_t tim
 	while (1) {
 		mtx_lock(&ha->hw_lock);
 
-		if (ha->qla_detach_active) {
+		if (ha->qla_detach_active || ha->offline) {
 			mtx_unlock(&ha->hw_lock);
 			break;
 		}
@@ -191,7 +191,10 @@ qla_lock(qla_host_t *ha, const char *str, uint32_t tim
 		}
 	}
 
-	//device_printf(ha->pci_dev, "%s: %s ret = %d\n", __func__, str,ret);
+//	if (!ha->enable_error_recovery)
+//		device_printf(ha->pci_dev, "%s: %s ret = %d\n", __func__,
+//			str,ret);
+
 	return (ret);
 }
 
@@ -202,7 +205,9 @@ qla_unlock(qla_host_t *ha, const char *str)
 	ha->hw_lock_held = 0;
 	ha->qla_unlock = str;
 	mtx_unlock(&ha->hw_lock);
-	//device_printf(ha->pci_dev, "%s: %s\n", __func__, str);
+
+//	if (!ha->enable_error_recovery)
+//		device_printf(ha->pci_dev, "%s: %s\n", __func__, str);
 
 	return;
 }

Modified: stable/11/sys/dev/qlxgbe/ql_ioctl.c
==============================================================================
--- stable/11/sys/dev/qlxgbe/ql_ioctl.c	Tue Mar  6 22:45:45 2018	(r330554)
+++ stable/11/sys/dev/qlxgbe/ql_ioctl.c	Tue Mar  6 23:12:32 2018	(r330555)
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
 #include "ql_ver.h"
 #include "ql_dbg.h"
 
+static int ql_slowpath_log(qla_host_t *ha, qla_sp_log_t *log);
 static int ql_drvr_state(qla_host_t *ha, qla_driver_state_t *drvr_state);
 static uint32_t ql_drvr_state_size(qla_host_t *ha);
 static int ql_eioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag,
@@ -224,6 +225,7 @@ ql_eioctl(struct cdev *dev, u_long cmd, caddr_t data, 
 	case QLA_RD_FW_DUMP:
 
 		if (ha->hw.mdump_init == 0) {
+			device_printf(pci_dev, "%s: minidump not initialized\n", __func__);
 			rval = EINVAL;
 			break;
 		}
@@ -233,45 +235,85 @@ ql_eioctl(struct cdev *dev, u_long cmd, caddr_t data, 
 		if ((fw_dump->minidump == NULL) ||
 			(fw_dump->minidump_size != (ha->hw.mdump_buffer_size +
 				ha->hw.mdump_template_size))) {
+			device_printf(pci_dev,
+				"%s: minidump buffer [%p] size = [%d, %d] invalid\n", __func__,
+				fw_dump->minidump, fw_dump->minidump_size,
+				(ha->hw.mdump_buffer_size + ha->hw.mdump_template_size));
 			rval = EINVAL;
 			break;
 		}
 
-		if (QLA_LOCK(ha, __func__, QLA_LOCK_DEFAULT_MS_TIMEOUT, 0) == 0) {
-			if (!ha->hw.mdump_done)
-				ha->qla_initiate_recovery = 1;
-			QLA_UNLOCK(ha, __func__);
-		} else {
+		if ((ha->pci_func & 0x1)) {
+			device_printf(pci_dev, "%s: mindump allowed only on Port0\n", __func__);
 			rval = ENXIO;
 			break;
 		}
+
+		fw_dump->saved = 1;
+
+		if (ha->offline) {
+
+			if (ha->enable_minidump)
+				ql_minidump(ha);
+
+			fw_dump->saved = 0;
+			fw_dump->usec_ts = ha->hw.mdump_usec_ts;
+
+			if (!ha->hw.mdump_done) {
+				device_printf(pci_dev,
+					"%s: port offline minidump failed\n", __func__);
+				rval = ENXIO;
+				break;
+			}
+		} else {
+
+			if (QLA_LOCK(ha, __func__, QLA_LOCK_DEFAULT_MS_TIMEOUT, 0) == 0) {
+				if (!ha->hw.mdump_done) {
+					fw_dump->saved = 0;
+					QL_INITIATE_RECOVERY(ha);
+					device_printf(pci_dev, "%s: recovery initiated "
+						" to trigger minidump\n",
+						__func__);
+				}
+				QLA_UNLOCK(ha, __func__);
+			} else {
+				device_printf(pci_dev, "%s: QLA_LOCK() failed0\n", __func__);
+				rval = ENXIO;
+				break;
+			}
 	
 #define QLNX_DUMP_WAIT_SECS	30
 
-		count = QLNX_DUMP_WAIT_SECS * 1000;
+			count = QLNX_DUMP_WAIT_SECS * 1000;
 
-		while (count) {
-			if (ha->hw.mdump_done)
-				break;
-			qla_mdelay(__func__, 100);
-			count -= 100;
-		}
+			while (count) {
+				if (ha->hw.mdump_done)
+					break;
+				qla_mdelay(__func__, 100);
+				count -= 100;
+			}
 
-		if (!ha->hw.mdump_done) {
-			rval = ENXIO;
-			break;
-		}
+			if (!ha->hw.mdump_done) {
+				device_printf(pci_dev,
+					"%s: port not offline minidump failed\n", __func__);
+				rval = ENXIO;
+				break;
+			}
+			fw_dump->usec_ts = ha->hw.mdump_usec_ts;
 			
-		if (QLA_LOCK(ha, __func__, QLA_LOCK_DEFAULT_MS_TIMEOUT, 0) == 0) {
-			ha->hw.mdump_done = 0;
-			QLA_UNLOCK(ha, __func__);
-		} else {
-			rval = ENXIO;
-			break;
+			if (QLA_LOCK(ha, __func__, QLA_LOCK_DEFAULT_MS_TIMEOUT, 0) == 0) {
+				ha->hw.mdump_done = 0;
+				QLA_UNLOCK(ha, __func__);
+			} else {
+				device_printf(pci_dev, "%s: QLA_LOCK() failed1\n", __func__);
+				rval = ENXIO;
+				break;
+			}
 		}
 
 		if ((rval = copyout(ha->hw.mdump_template,
 			fw_dump->minidump, ha->hw.mdump_template_size))) {
+			device_printf(pci_dev, "%s: template copyout failed\n", __func__);
 			rval = ENXIO;
 			break;
 		}
@@ -279,14 +321,20 @@ ql_eioctl(struct cdev *dev, u_long cmd, caddr_t data, 
 		if ((rval = copyout(ha->hw.mdump_buffer,
 				((uint8_t *)fw_dump->minidump +
 					ha->hw.mdump_template_size),
-				ha->hw.mdump_buffer_size)))
+				ha->hw.mdump_buffer_size))) {
+			device_printf(pci_dev, "%s: minidump copyout failed\n", __func__);
 			rval = ENXIO;
+		}
 		break;
 
 	case QLA_RD_DRVR_STATE:
 		rval = ql_drvr_state(ha, (qla_driver_state_t *)data);
 		break;
 
+	case QLA_RD_SLOWPATH_LOG:
+		rval = ql_slowpath_log(ha, (qla_sp_log_t *)data);
+		break;
+
 	case QLA_RD_PCI_IDS:
 		pci_ids = (qla_rd_pci_ids_t *)data;
 		pci_ids->ven_id = pci_get_vendor(pci_dev);
@@ -304,12 +352,12 @@ ql_eioctl(struct cdev *dev, u_long cmd, caddr_t data, 
 }
 
 
+
 static int
 ql_drvr_state(qla_host_t *ha, qla_driver_state_t *state)
 {
 	int rval = 0;
 	uint32_t drvr_state_size;
-	qla_drvr_state_hdr_t *hdr;
 
 	drvr_state_size = ql_drvr_state_size(ha);
 
@@ -324,11 +372,8 @@ ql_drvr_state(qla_host_t *ha, qla_driver_state_t *stat
 	if (ha->hw.drvr_state == NULL)
 		return (ENOMEM);
 
-	hdr = ha->hw.drvr_state;
+	ql_capture_drvr_state(ha);
 
-	if (!hdr->drvr_version_major)
-		ql_capture_drvr_state(ha);
-
 	rval = copyout(ha->hw.drvr_state, state->buffer, drvr_state_size);
 
 	bzero(ha->hw.drvr_state, drvr_state_size);
@@ -416,22 +461,26 @@ ql_capture_drvr_state(qla_host_t *ha)
 {
 	uint8_t *state_buffer;
 	uint8_t *ptr;
-	uint32_t drvr_state_size;
 	qla_drvr_state_hdr_t *hdr;
 	uint32_t size;
 	int i;
 
-	drvr_state_size = ql_drvr_state_size(ha);
-
 	state_buffer =  ha->hw.drvr_state;
 
 	if (state_buffer == NULL)
 		return;
-	
-	bzero(state_buffer, drvr_state_size);
 
 	hdr = (qla_drvr_state_hdr_t *)state_buffer;
+	
+	hdr->saved = 0;
 
+	if (hdr->drvr_version_major) {
+		hdr->saved = 1;
+		return;
+	}
+
+	hdr->usec_ts = qla_get_usec_timestamp();
+
 	hdr->drvr_version_major = QLA_VERSION_MAJOR;
 	hdr->drvr_version_minor = QLA_VERSION_MINOR;
 	hdr->drvr_version_build = QLA_VERSION_BUILD;
@@ -512,6 +561,9 @@ ql_alloc_drvr_state_buffer(qla_host_t *ha)
 
 	ha->hw.drvr_state =  malloc(drvr_state_size, M_QLA83XXBUF, M_NOWAIT);	
 
+	if (ha->hw.drvr_state != NULL)
+		bzero(ha->hw.drvr_state, drvr_state_size);
+
 	return;
 }
 
@@ -521,5 +573,95 @@ ql_free_drvr_state_buffer(qla_host_t *ha)
 	if (ha->hw.drvr_state != NULL)
 		free(ha->hw.drvr_state, M_QLA83XXBUF);
 	return;
+}
+
+void
+ql_sp_log(qla_host_t *ha, uint16_t fmtstr_idx, uint16_t num_params,
+	uint32_t param0, uint32_t param1, uint32_t param2, uint32_t param3,
+	uint32_t param4)
+{
+	qla_sp_log_entry_t *sp_e, *sp_log;
+
+	if (((sp_log = ha->hw.sp_log) == NULL) || ha->hw.sp_log_stop)
+		return;
+
+	mtx_lock(&ha->sp_log_lock);
+
+	sp_e = &sp_log[ha->hw.sp_log_index];
+
+	bzero(sp_e, sizeof (qla_sp_log_entry_t));
+
+	sp_e->fmtstr_idx = fmtstr_idx;
+	sp_e->num_params = num_params;
+
+	sp_e->usec_ts = qla_get_usec_timestamp();
+
+	sp_e->params[0] = param0;
+	sp_e->params[1] = param1;
+	sp_e->params[2] = param2;
+	sp_e->params[3] = param3;
+	sp_e->params[4] = param4;
+
+	ha->hw.sp_log_index = (ha->hw.sp_log_index + 1) & (NUM_LOG_ENTRIES - 1);
+
+	if (ha->hw.sp_log_num_entries < NUM_LOG_ENTRIES)
+		ha->hw.sp_log_num_entries++;
+
+	mtx_unlock(&ha->sp_log_lock);
+
+	return;
+}
+
+void
+ql_alloc_sp_log_buffer(qla_host_t *ha)
+{
+	uint32_t size;
+
+	size = (sizeof(qla_sp_log_entry_t)) * NUM_LOG_ENTRIES;
+
+	ha->hw.sp_log =  malloc(size, M_QLA83XXBUF, M_NOWAIT);	
+
+	if (ha->hw.sp_log != NULL)
+		bzero(ha->hw.sp_log, size);
+
+	ha->hw.sp_log_index = 0;
+	ha->hw.sp_log_num_entries = 0;
+
+	return;
+}
+
+void
+ql_free_sp_log_buffer(qla_host_t *ha)
+{
+	if (ha->hw.sp_log != NULL)
+		free(ha->hw.sp_log, M_QLA83XXBUF);
+	return;
+}
+

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201803062312.w26NCW4n088637>