Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 6 Jan 2010 14:11:56 -0800
From:      Matt Reimer <mattjreimer@gmail.com>
To:        freebsd-fs <freebsd-fs@freebsd.org>, Doug Rabson <dfr@rabson.org>,  Pawel Jakub Dawidek <pjd@freebsd.org>
Subject:   Fwd: PATCH: teach (gpt)zfsboot, zfsloader to discern vdev status  correctly
Message-ID:  <f383264b1001061411i5feb09f2pdfb9e7b29bdff1c1@mail.gmail.com>
In-Reply-To: <f383264b0912141446x435c7275lca1c315a7324a4bb@mail.gmail.com>
References:  <f383264b0912141446x435c7275lca1c315a7324a4bb@mail.gmail.com>

next in thread | previous in thread | raw e-mail | index | archive | help

[-- Attachment #1 --]
Would one of you be able to review this patch and commit it if it's
acceptable? This is the same patch I submitted last month.

Matt

----

Instead of assuming all vdevs are healthy, check the newest vdev label
for each vdev's status. Booting from a degraded vdev should now be
more robust.

Sponsored by: VPOP Technologies, Inc.

Matt Reimer

(Note that much of this patch consists merely of whitespace changes due to a
block needing to be reindented. I've attached
correct-status-nowhitespace.patch to make review easier.)

[-- Attachment #2 --]
--- /sys/cddl/boot/zfs/zfsimpl.h.ORIG	2009-11-21 07:02:35.000000000 -0800
+++ /sys/cddl/boot/zfs/zfsimpl.h	2009-12-07 13:50:26.000000000 -0800
@@ -546,7 +546,6 @@
 #define	ZPOOL_CONFIG_DTL		"DTL"
 #define	ZPOOL_CONFIG_STATS		"stats"
 #define	ZPOOL_CONFIG_WHOLE_DISK		"whole_disk"
-#define	ZPOOL_CONFIG_OFFLINE		"offline"
 #define	ZPOOL_CONFIG_ERRCOUNT		"error_count"
 #define	ZPOOL_CONFIG_NOT_PRESENT	"not_present"
 #define	ZPOOL_CONFIG_SPARES		"spares"
@@ -556,6 +555,16 @@
 #define	ZPOOL_CONFIG_HOSTNAME		"hostname"
 #define	ZPOOL_CONFIG_TIMESTAMP		"timestamp" /* not stored on disk */
 
+/*
+ * The persistent vdev state is stored as separate values rather than a single
+ * 'vdev_state' entry.  This is because a device can be in multiple states, such
+ * as offline and degraded.
+ */
+#define ZPOOL_CONFIG_OFFLINE            "offline"
+#define ZPOOL_CONFIG_FAULTED            "faulted"
+#define ZPOOL_CONFIG_DEGRADED           "degraded"
+#define ZPOOL_CONFIG_REMOVED            "removed"
+
 #define	VDEV_TYPE_ROOT			"root"
 #define	VDEV_TYPE_MIRROR		"mirror"
 #define	VDEV_TYPE_REPLACING		"replacing"
@@ -588,7 +597,9 @@
 	VDEV_STATE_UNKNOWN = 0,	/* Uninitialized vdev			*/
 	VDEV_STATE_CLOSED,	/* Not currently open			*/
 	VDEV_STATE_OFFLINE,	/* Not allowed to open			*/
+        VDEV_STATE_REMOVED,	/* Explicitly removed from system	*/
 	VDEV_STATE_CANT_OPEN,	/* Tried to open, but failed		*/
+        VDEV_STATE_FAULTED,	/* External request to fault device	*/
 	VDEV_STATE_DEGRADED,	/* Replicated vdev with unhealthy kids	*/
 	VDEV_STATE_HEALTHY	/* Presumed good			*/
 } vdev_state_t;
--- /sys/boot/zfs/zfsimpl.c.ORIG	2009-11-21 07:02:35.000000000 -0800
+++ /sys/boot/zfs/zfsimpl.c	2009-12-07 14:36:20.000000000 -0800
@@ -404,7 +404,7 @@
 }
 
 static int
-vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp)
+vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp, int is_newer)
 {
 	int rc;
 	uint64_t guid, id, ashift, nparity;
@@ -412,7 +412,8 @@
 	const char *path;
 	vdev_t *vdev, *kid;
 	const unsigned char *kids;
-	int nkids, i;
+	int nkids, i, is_new;
+	uint64_t is_offline, is_faulted, is_degraded, is_removed;
 
 	if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
 			DATA_TYPE_UINT64, 0, &guid)
@@ -424,17 +425,6 @@
 		return (ENOENT);
 	}
 
-	/*
-	 * Assume that if we've seen this vdev tree before, this one
-	 * will be identical.
-	 */
-	vdev = vdev_find(guid);
-	if (vdev) {
-		if (vdevp)
-			*vdevp = vdev;
-		return (0);
-	}
-
 	if (strcmp(type, VDEV_TYPE_MIRROR)
 	    && strcmp(type, VDEV_TYPE_DISK)
 	    && strcmp(type, VDEV_TYPE_RAIDZ)) {
@@ -442,44 +432,95 @@
 		return (EIO);
 	}
 
-	if (!strcmp(type, VDEV_TYPE_MIRROR))
-		vdev = vdev_create(guid, vdev_mirror_read);
-	else if (!strcmp(type, VDEV_TYPE_RAIDZ))
-		vdev = vdev_create(guid, vdev_raidz_read);
-	else
-		vdev = vdev_create(guid, vdev_disk_read);
+	is_offline = is_removed = is_faulted = is_degraded = 0;
+
+	nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
+			&is_offline);
+	nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
+			&is_removed);
+	nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
+			&is_faulted);
+	nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
+			&is_degraded);
+
+	vdev = vdev_find(guid);
+	if (!vdev) {
+
+		is_new = 1;
+
+		if (!strcmp(type, VDEV_TYPE_MIRROR))
+			vdev = vdev_create(guid, vdev_mirror_read);
+		else if (!strcmp(type, VDEV_TYPE_RAIDZ))
+			vdev = vdev_create(guid, vdev_raidz_read);
+		else
+			vdev = vdev_create(guid, vdev_disk_read);
+
+		vdev->v_id = id;
+		if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
+			DATA_TYPE_UINT64, 0, &ashift) == 0)
+			vdev->v_ashift = ashift;
+		else
+			vdev->v_ashift = 0;
+		if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
+			DATA_TYPE_UINT64, 0, &nparity) == 0)
+			vdev->v_nparity = nparity;
+		else
+			vdev->v_nparity = 0;
+		if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
+				DATA_TYPE_STRING, 0, &path) == 0) {
+			if (strlen(path) > 5
+			    && path[0] == '/'
+			    && path[1] == 'd'
+			    && path[2] == 'e'
+			    && path[3] == 'v'
+			    && path[4] == '/')
+				path += 5;
+			vdev->v_name = strdup(path);
+		} else {
+			if (!strcmp(type, "raidz")) {
+				if (vdev->v_nparity == 1)
+					vdev->v_name = "raidz1";
+				else
+					vdev->v_name = "raidz2";
+			} else {
+				vdev->v_name = strdup(type);
+			}
+		}
+
+		if (is_offline)
+			vdev->v_state = VDEV_STATE_OFFLINE;
+		else if (is_removed)
+			vdev->v_state = VDEV_STATE_REMOVED;
+		else if (is_faulted)
+			vdev->v_state = VDEV_STATE_FAULTED;
+		else if (is_degraded)
+			vdev->v_state = VDEV_STATE_DEGRADED;
+		else
+			vdev->v_state = VDEV_STATE_HEALTHY;
 
-	vdev->v_id = id;
-	if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
-		DATA_TYPE_UINT64, 0, &ashift) == 0)
-		vdev->v_ashift = ashift;
-	else
-		vdev->v_ashift = 0;
-	if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
-		DATA_TYPE_UINT64, 0, &nparity) == 0)
-		vdev->v_nparity = nparity;
-	else
-		vdev->v_nparity = 0;
-	if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
-			DATA_TYPE_STRING, 0, &path) == 0) {
-		if (strlen(path) > 5
-		    && path[0] == '/'
-		    && path[1] == 'd'
-		    && path[2] == 'e'
-		    && path[3] == 'v'
-		    && path[4] == '/')
-			path += 5;
-		vdev->v_name = strdup(path);
 	} else {
-		if (!strcmp(type, "raidz")) {
-			if (vdev->v_nparity == 1)
-				vdev->v_name = "raidz1";
+
+		is_new = 0;
+
+		if (is_newer) {
+
+			/* We've already seen this vdev, but from an older
+			 * vdev label, so let's refresh its state from the
+			 * newer label. */
+
+			if (is_offline)
+				vdev->v_state = VDEV_STATE_OFFLINE;
+			else if (is_removed)
+				vdev->v_state = VDEV_STATE_REMOVED;
+			else if (is_faulted)
+				vdev->v_state = VDEV_STATE_FAULTED;
+			else if (is_degraded)
+				vdev->v_state = VDEV_STATE_DEGRADED;
 			else
-				vdev->v_name = "raidz2";
-		} else {
-			vdev->v_name = strdup(type);
+				vdev->v_state = VDEV_STATE_HEALTHY;
 		}
 	}
+
 	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
 			 DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
 	/*
@@ -488,10 +529,12 @@
 	if (rc == 0) {
 		vdev->v_nchildren = nkids;
 		for (i = 0; i < nkids; i++) {
-			rc = vdev_init_from_nvlist(kids, &kid);
+			rc = vdev_init_from_nvlist(kids, &kid, is_newer);
 			if (rc)
 				return (rc);
-			STAILQ_INSERT_TAIL(&vdev->v_children, kid, v_childlink);
+			if (is_new)
+				STAILQ_INSERT_TAIL(&vdev->v_children, kid,
+						   v_childlink);
 			kids = nvlist_next(kids);
 		}
 	} else {
@@ -593,7 +636,9 @@
 		"UNKNOWN",
 		"CLOSED",
 		"OFFLINE",
+		"REMOVED",
 		"CANT_OPEN",
+		"FAULTED",
 		"DEGRADED",
 		"ONLINE"
 	};
@@ -711,7 +756,7 @@
 	uint64_t pool_txg, pool_guid;
 	const char *pool_name;
 	const unsigned char *vdevs;
-	int i, rc;
+	int i, rc, is_newer;
 	char upbuf[1024];
 	const struct uberblock *up;
 
@@ -793,12 +838,15 @@
 		spa = spa_create(pool_guid);
 		spa->spa_name = strdup(pool_name);
 	}
-	if (pool_txg > spa->spa_txg)
+	if (pool_txg > spa->spa_txg) {
 		spa->spa_txg = pool_txg;
+		is_newer = 1;
+	} else
+		is_newer = 0;
 
 	/*
 	 * Get the vdev tree and create our in-core copy of it.
-	 * If we already have a healthy vdev with this guid, this must
+	 * If we already have a vdev with this guid, this must
 	 * be some kind of alias (overlapping slices, dangerously dedicated
 	 * disks etc).
 	 */
@@ -808,16 +856,16 @@
 		return (EIO);
 	}
 	vdev = vdev_find(guid);
-	if (vdev && vdev->v_state == VDEV_STATE_HEALTHY) {
+	if (vdev && vdev->v_phys_read)	/* Has this vdev already been inited? */
 		return (EIO);
-	}
 
 	if (nvlist_find(nvlist,
 			ZPOOL_CONFIG_VDEV_TREE,
 			DATA_TYPE_NVLIST, 0, &vdevs)) {
 		return (EIO);
 	}
-	rc = vdev_init_from_nvlist(vdevs, &top_vdev);
+
+	rc = vdev_init_from_nvlist(vdevs, &top_vdev, is_newer);
 	if (rc)
 		return (rc);
 
@@ -838,7 +886,6 @@
 	if (vdev) {
 		vdev->v_phys_read = read;
 		vdev->v_read_priv = read_priv;
-		vdev->v_state = VDEV_STATE_HEALTHY;
 	} else {
 		printf("ZFS: inconsistent nvlist contents\n");
 		return (EIO);

[-- Attachment #3 --]
--- zfs/zfsimpl.c.ORIG	2009-11-21 15:02:35.000000000 +0000
+++ zfs/zfsimpl.c	2009-12-11 12:26:38.745954297 +0000
@@ -51,7 +51,7 @@
 static char *zap_scratch;
 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
 
-#define TEMP_SIZE	(1*SPA_MAXBLOCKSIZE)
+#define TEMP_SIZE	(1024 * 1024)
 
 static int zio_read(spa_t *spa, const blkptr_t *bp, void *buf);
 
@@ -404,7 +404,7 @@
 }
 
 static int
-vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp)
+vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp, int is_newer)
 {
 	int rc;
 	uint64_t guid, id, ashift, nparity;
@@ -412,7 +412,8 @@
 	const char *path;
 	vdev_t *vdev, *kid;
 	const unsigned char *kids;
-	int nkids, i;
+	int nkids, i, is_new;
+	uint64_t is_offline, is_faulted, is_degraded, is_removed;
 
 	if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
 			DATA_TYPE_UINT64, 0, &guid)
@@ -424,17 +425,6 @@
 		return (ENOENT);
 	}
 
-	/*
-	 * Assume that if we've seen this vdev tree before, this one
-	 * will be identical.
-	 */
-	vdev = vdev_find(guid);
-	if (vdev) {
-		if (vdevp)
-			*vdevp = vdev;
-		return (0);
-	}
-
 	if (strcmp(type, VDEV_TYPE_MIRROR)
 	    && strcmp(type, VDEV_TYPE_DISK)
 	    && strcmp(type, VDEV_TYPE_RAIDZ)) {
@@ -442,6 +432,22 @@
 		return (EIO);
 	}
 
+	is_offline = is_removed = is_faulted = is_degraded = 0;
+
+	nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
+			&is_offline);
+	nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
+			&is_removed);
+	nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
+			&is_faulted);
+	nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
+			&is_degraded);
+
+	vdev = vdev_find(guid);
+	if (!vdev) {
+
+		is_new = 1;
+
 	if (!strcmp(type, VDEV_TYPE_MIRROR))
 		vdev = vdev_create(guid, vdev_mirror_read);
 	else if (!strcmp(type, VDEV_TYPE_RAIDZ))
@@ -480,6 +486,41 @@
 			vdev->v_name = strdup(type);
 		}
 	}
+
+		if (is_offline)
+			vdev->v_state = VDEV_STATE_OFFLINE;
+		else if (is_removed)
+			vdev->v_state = VDEV_STATE_REMOVED;
+		else if (is_faulted)
+			vdev->v_state = VDEV_STATE_FAULTED;
+		else if (is_degraded)
+			vdev->v_state = VDEV_STATE_DEGRADED;
+		else
+			vdev->v_state = VDEV_STATE_HEALTHY;
+
+	} else {
+
+		is_new = 0;
+
+		if (is_newer) {
+
+			/* We've already seen this vdev, but from an older
+			 * vdev label, so let's refresh its state from the
+			 * newer label. */
+
+			if (is_offline)
+				vdev->v_state = VDEV_STATE_OFFLINE;
+			else if (is_removed)
+				vdev->v_state = VDEV_STATE_REMOVED;
+			else if (is_faulted)
+				vdev->v_state = VDEV_STATE_FAULTED;
+			else if (is_degraded)
+				vdev->v_state = VDEV_STATE_DEGRADED;
+			else
+				vdev->v_state = VDEV_STATE_HEALTHY;
+		}
+	}
+
 	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
 			 DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
 	/*
@@ -488,10 +529,12 @@
 	if (rc == 0) {
 		vdev->v_nchildren = nkids;
 		for (i = 0; i < nkids; i++) {
-			rc = vdev_init_from_nvlist(kids, &kid);
+			rc = vdev_init_from_nvlist(kids, &kid, is_newer);
 			if (rc)
 				return (rc);
-			STAILQ_INSERT_TAIL(&vdev->v_children, kid, v_childlink);
+			if (is_new)
+				STAILQ_INSERT_TAIL(&vdev->v_children, kid,
+						   v_childlink);
 			kids = nvlist_next(kids);
 		}
 	} else {
@@ -593,7 +636,9 @@
 		"UNKNOWN",
 		"CLOSED",
 		"OFFLINE",
+		"REMOVED",
 		"CANT_OPEN",
+		"FAULTED",
 		"DEGRADED",
 		"ONLINE"
 	};
@@ -711,7 +756,7 @@
 	uint64_t pool_txg, pool_guid;
 	const char *pool_name;
 	const unsigned char *vdevs;
-	int i, rc;
+	int i, rc, is_newer;
 	char upbuf[1024];
 	const struct uberblock *up;
 
@@ -793,12 +838,15 @@
 		spa = spa_create(pool_guid);
 		spa->spa_name = strdup(pool_name);
 	}
-	if (pool_txg > spa->spa_txg)
+	if (pool_txg > spa->spa_txg) {
 		spa->spa_txg = pool_txg;
+		is_newer = 1;
+	} else
+		is_newer = 0;
 
 	/*
 	 * Get the vdev tree and create our in-core copy of it.
-	 * If we already have a healthy vdev with this guid, this must
+	 * If we already have a vdev with this guid, this must
 	 * be some kind of alias (overlapping slices, dangerously dedicated
 	 * disks etc).
 	 */
@@ -808,16 +856,16 @@
 		return (EIO);
 	}
 	vdev = vdev_find(guid);
-	if (vdev && vdev->v_state == VDEV_STATE_HEALTHY) {
+	if (vdev && vdev->v_phys_read)	/* Has this vdev already been inited? */
 		return (EIO);
-	}
 
 	if (nvlist_find(nvlist,
 			ZPOOL_CONFIG_VDEV_TREE,
 			DATA_TYPE_NVLIST, 0, &vdevs)) {
 		return (EIO);
 	}
-	rc = vdev_init_from_nvlist(vdevs, &top_vdev);
+
+	rc = vdev_init_from_nvlist(vdevs, &top_vdev, is_newer);
 	if (rc)
 		return (rc);
 
@@ -838,7 +886,6 @@
 	if (vdev) {
 		vdev->v_phys_read = read;
 		vdev->v_read_priv = read_priv;
-		vdev->v_state = VDEV_STATE_HEALTHY;
 	} else {
 		printf("ZFS: inconsistent nvlist contents\n");
 		return (EIO);

Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?f383264b1001061411i5feb09f2pdfb9e7b29bdff1c1>