Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 21 Sep 2009 20:16:10 +0000 (UTC)
From:      Andrew Gallatin <gallatin@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r197395 - head/sys/dev/mxge
Message-ID:  <200909212016.n8LKGAjP098398@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: gallatin
Date: Mon Sep 21 20:16:10 2009
New Revision: 197395
URL: http://svn.freebsd.org/changeset/base/197395

Log:
  Improve mxge watchdog routine's ability to reliably reset a failed NIC:
  
  - Mark the link as down, so if watchdog reset fails, link watching
      failover software can notice it
  - Don't send MXGEFW_CMD_ETHERNET_DOWN if the NIC has been reset, it is
      not needed, and will fail on a freshly reset NIC.
  - Ensure the transmit routines aren't attempting to PIO write to doorbells
      while the NIC is being reset.
  - Download the correct f/w, rather than using the EEPROM f/w after reset.
  - Export a count of the number of watchdog resets via sysctl
  - Zero all f/w stats at reset.  This will lead to less confusing
      diagnostic output when investigating NIC failures.
  
  MFC after:	3 days

Modified:
  head/sys/dev/mxge/if_mxge.c

Modified: head/sys/dev/mxge/if_mxge.c
==============================================================================
--- head/sys/dev/mxge/if_mxge.c	Mon Sep 21 18:02:02 2009	(r197394)
+++ head/sys/dev/mxge/if_mxge.c	Mon Sep 21 20:16:10 2009	(r197395)
@@ -144,7 +144,7 @@ MODULE_DEPEND(mxge, zlib, 1, 1, 1);
 
 static int mxge_load_firmware(mxge_softc_t *sc, int adopt);
 static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data);
-static int mxge_close(mxge_softc_t *sc);
+static int mxge_close(mxge_softc_t *sc, int down);
 static int mxge_open(mxge_softc_t *sc);
 static void mxge_tick(void *arg);
 
@@ -1309,8 +1309,7 @@ mxge_reset(mxge_softc_t *sc, int interru
 		ss->lro_queued = 0;
 		ss->lro_flushed = 0;
 		if (ss->fw_stats != NULL) {
-			ss->fw_stats->valid = 0;
-			ss->fw_stats->send_done_count = 0;
+			bzero(ss->fw_stats, sizeof *ss->fw_stats);
 		}
 	}
 	sc->rdma_tags_available = 15;
@@ -1421,7 +1420,7 @@ mxge_change_lro_locked(mxge_softc_t *sc,
 		ifp->if_capenable |= IFCAP_LRO;
 	sc->lro_cnt = lro_cnt;
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
-		mxge_close(sc);
+		mxge_close(sc, 0);
 		err = mxge_open(sc);
 	}
 	return err;
@@ -1537,6 +1536,10 @@ mxge_add_sysctls(mxge_softc_t *sc)
 		       "read_write_dma_MBs",
 		       CTLFLAG_RD, &sc->read_write_dma,
 		       0, "DMA concurrent Read/Write speed in MB/s");
+	SYSCTL_ADD_INT(ctx, children, OID_AUTO, 
+		       "watchdog_resets",
+		       CTLFLAG_RD, &sc->watchdog_resets,
+		       0, "Number of times NIC was reset");
 
 
 	/* performance related tunables */
@@ -3648,7 +3651,7 @@ abort:
 }
 
 static int
-mxge_close(mxge_softc_t *sc)
+mxge_close(mxge_softc_t *sc, int down)
 {
 	mxge_cmd_t cmd;
 	int err, old_down_cnt;
@@ -3665,21 +3668,23 @@ mxge_close(mxge_softc_t *sc)
 	}
 #endif
 	sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
-	old_down_cnt = sc->down_cnt;
-	wmb();
-	err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
-	if (err) {
-		device_printf(sc->dev, "Couldn't bring down link\n");
-	}
-	if (old_down_cnt == sc->down_cnt) {
-		/* wait for down irq */
-		DELAY(10 * sc->intr_coal_delay);
-	}
-	wmb();
-	if (old_down_cnt == sc->down_cnt) {
-		device_printf(sc->dev, "never got down irq\n");
+	if (!down) {
+		old_down_cnt = sc->down_cnt;
+		wmb();
+		err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd);
+		if (err) {
+			device_printf(sc->dev,
+				      "Couldn't bring down link\n");
+		}
+		if (old_down_cnt == sc->down_cnt) {
+			/* wait for down irq */
+			DELAY(10 * sc->intr_coal_delay);
+		}
+		wmb();
+		if (old_down_cnt == sc->down_cnt) {
+			device_printf(sc->dev, "never got down irq\n");
+		}
 	}
-
 	mxge_free_mbufs(sc);
 
 	return 0;
@@ -3732,8 +3737,9 @@ static int
 mxge_watchdog_reset(mxge_softc_t *sc, int slice)
 {
 	struct pci_devinfo *dinfo;
+	struct mxge_slice_state *ss;
 	mxge_tx_ring_t *tx;
-	int err;
+	int err, running, s, num_tx_slices = 1;
 	uint32_t reboot;
 	uint16_t cmd;
 
@@ -3767,6 +3773,30 @@ mxge_watchdog_reset(mxge_softc_t *sc, in
 		reboot = mxge_read_reboot(sc);
 		device_printf(sc->dev, "NIC rebooted, status = 0x%x\n",
 			      reboot);
+		running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING;
+		if (running) {
+
+			/* 
+			 * quiesce NIC so that TX routines will not try to
+			 * xmit after restoration of BAR
+			 */
+
+			/* Mark the link as down */
+			if (sc->link_state) {
+				sc->link_state = 0;
+				if_link_state_change(sc->ifp,
+						     LINK_STATE_DOWN);
+			}
+#ifdef IFNET_BUF_RING
+			num_tx_slices = sc->num_slices;
+#endif
+			/* grab all TX locks to ensure no tx  */
+			for (s = 0; s < num_tx_slices; s++) {
+				ss = &sc->ss[s];
+				mtx_lock(&ss->tx.mtx);
+			}
+			mxge_close(sc, 1);
+		}
 		/* restore PCI configuration space */
 		dinfo = device_get_ivars(sc->dev);
 		pci_cfg_restore(sc->dev, dinfo);
@@ -3774,10 +3804,22 @@ mxge_watchdog_reset(mxge_softc_t *sc, in
 		/* and redo any changes we made to our config space */
 		mxge_setup_cfg_space(sc);
 
-		if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) {
-			mxge_close(sc);
-			err = mxge_open(sc);
+		/* reload f/w */
+		err = mxge_load_firmware(sc, 0);
+		if (err) {
+			device_printf(sc->dev,
+				      "Unable to re-load f/w\n");
 		}
+		if (running) {
+			if (!err)
+				err = mxge_open(sc);
+			/* release all TX locks */
+			for (s = 0; s < num_tx_slices; s++) {
+				ss = &sc->ss[s];
+				mtx_unlock(&ss->tx.mtx);
+			}
+		}
+		sc->watchdog_resets++;
 	} else {
 		tx = &sc->ss[slice].tx;
 		device_printf(sc->dev,
@@ -3793,6 +3835,9 @@ mxge_watchdog_reset(mxge_softc_t *sc, in
 			      be32toh(sc->ss->fw_stats->send_done_count));
 		device_printf(sc->dev, "not resetting\n");
 	}
+	if (err)
+		device_printf(sc->dev, "watchdog reset failed\n");
+
 	return (err);
 }
 
@@ -3908,11 +3953,11 @@ mxge_change_mtu(mxge_softc_t *sc, int mt
 	old_mtu = ifp->if_mtu;
 	ifp->if_mtu = mtu;
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
-		mxge_close(sc);
+		mxge_close(sc, 0);
 		err = mxge_open(sc);
 		if (err != 0) {
 			ifp->if_mtu = old_mtu;
-			mxge_close(sc);
+			mxge_close(sc, 0);
 			(void) mxge_open(sc);
 		}
 	}
@@ -3970,7 +4015,7 @@ mxge_ioctl(struct ifnet *ifp, u_long com
 			}
 		} else {
 			if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
-				mxge_close(sc);
+				mxge_close(sc, 0);
 			}
 		}
 		mtx_unlock(&sc->driver_mtx);
@@ -4700,7 +4745,7 @@ mxge_detach(device_t dev)
 	mtx_lock(&sc->driver_mtx);
 	sc->dying = 1;
 	if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING)
-		mxge_close(sc);
+		mxge_close(sc, 0);
 	mtx_unlock(&sc->driver_mtx);
 	ether_ifdetach(sc->ifp);
 	callout_drain(&sc->co_hdl);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200909212016.n8LKGAjP098398>