Date: Wed, 6 Jan 2010 14:11:56 -0800
From: Matt Reimer <mattjreimer@gmail.com>
To: freebsd-fs <freebsd-fs@freebsd.org>, Doug Rabson <dfr@rabson.org>, Pawel Jakub Dawidek <pjd@freebsd.org>
Subject: Fwd: PATCH: teach (gpt)zfsboot, zfsloader to discern vdev status correctly
Message-ID: <f383264b1001061411i5feb09f2pdfb9e7b29bdff1c1@mail.gmail.com>
In-Reply-To: <f383264b0912141446x435c7275lca1c315a7324a4bb@mail.gmail.com>
References: <f383264b0912141446x435c7275lca1c315a7324a4bb@mail.gmail.com>
[-- Attachment #1 --]
Would one of you be able to review this patch and commit it if it's
acceptable? This is the same patch I submitted last month.
Matt
----
Instead of assuming all vdevs are healthy, check the newest vdev label
to determine each vdev's status. Booting from a degraded vdev should now
be more robust.
Sponsored by: VPOP Technologies, Inc.
Matt Reimer
(Note that much of this patch consists merely of whitespace changes,
because one block needed to be reindented. I've attached
correct-status-nowhitespace.patch to make review easier; a small
illustrative sketch of the new logic follows below.)
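To make the intent concrete, here is a minimal, self-contained sketch of
the two pieces of logic the patch adds: mapping the per-state booleans
stored in a vdev label to a single vdev_state_t, and deciding whether a
label is the newest one seen so far (only a newer label may refresh vdev
state). This sketch is not part of the patch; the helper names and the
little main() are illustrative only, and the enum is copied from the
zfsimpl.h hunk below.

#include <stdint.h>
#include <stdio.h>

typedef enum vdev_state {
	VDEV_STATE_UNKNOWN = 0,	/* Uninitialized vdev */
	VDEV_STATE_CLOSED,	/* Not currently open */
	VDEV_STATE_OFFLINE,	/* Not allowed to open */
	VDEV_STATE_REMOVED,	/* Explicitly removed from system */
	VDEV_STATE_CANT_OPEN,	/* Tried to open, but failed */
	VDEV_STATE_FAULTED,	/* External request to fault device */
	VDEV_STATE_DEGRADED,	/* Replicated vdev with unhealthy kids */
	VDEV_STATE_HEALTHY	/* Presumed good */
} vdev_state_t;

/*
 * Map the per-state flags found in a vdev label (ZPOOL_CONFIG_OFFLINE
 * and friends) to one vdev_state_t, in the same priority order the
 * patch uses in vdev_init_from_nvlist().
 */
static vdev_state_t
vdev_state_from_flags(uint64_t is_offline, uint64_t is_removed,
    uint64_t is_faulted, uint64_t is_degraded)
{
	if (is_offline)
		return (VDEV_STATE_OFFLINE);
	if (is_removed)
		return (VDEV_STATE_REMOVED);
	if (is_faulted)
		return (VDEV_STATE_FAULTED);
	if (is_degraded)
		return (VDEV_STATE_DEGRADED);
	return (VDEV_STATE_HEALTHY);
}

/*
 * Track the highest pool txg seen across labels; a label counts as
 * newer (and thus may refresh vdev state) only if its txg beats the
 * highest txg recorded so far, mirroring the is_newer bookkeeping
 * the patch adds to vdev_probe().
 */
static int
label_is_newer(uint64_t *spa_txg, uint64_t pool_txg)
{
	if (pool_txg > *spa_txg) {
		*spa_txg = pool_txg;
		return (1);
	}
	return (0);
}

int
main(void)
{
	uint64_t spa_txg = 0;

	/* A faulted flag takes precedence over a degraded flag. */
	printf("state = %d\n", vdev_state_from_flags(0, 0, 1, 1));

	printf("%d\n", label_is_newer(&spa_txg, 100));	/* 1: first label */
	printf("%d\n", label_is_newer(&spa_txg, 90));	/* 0: stale label */
	printf("%d\n", label_is_newer(&spa_txg, 120));	/* 1: newer label */
	return (0);
}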
[-- Attachment #2 --]
--- /sys/cddl/boot/zfs/zfsimpl.h.ORIG 2009-11-21 07:02:35.000000000 -0800
+++ /sys/cddl/boot/zfs/zfsimpl.h 2009-12-07 13:50:26.000000000 -0800
@@ -546,7 +546,6 @@
#define ZPOOL_CONFIG_DTL "DTL"
#define ZPOOL_CONFIG_STATS "stats"
#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
-#define ZPOOL_CONFIG_OFFLINE "offline"
#define ZPOOL_CONFIG_ERRCOUNT "error_count"
#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
#define ZPOOL_CONFIG_SPARES "spares"
@@ -556,6 +555,16 @@
#define ZPOOL_CONFIG_HOSTNAME "hostname"
#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
+/*
+ * The persistent vdev state is stored as separate values rather than a single
+ * 'vdev_state' entry. This is because a device can be in multiple states, such
+ * as offline and degraded.
+ */
+#define ZPOOL_CONFIG_OFFLINE "offline"
+#define ZPOOL_CONFIG_FAULTED "faulted"
+#define ZPOOL_CONFIG_DEGRADED "degraded"
+#define ZPOOL_CONFIG_REMOVED "removed"
+
#define VDEV_TYPE_ROOT "root"
#define VDEV_TYPE_MIRROR "mirror"
#define VDEV_TYPE_REPLACING "replacing"
@@ -588,7 +597,9 @@
VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */
VDEV_STATE_CLOSED, /* Not currently open */
VDEV_STATE_OFFLINE, /* Not allowed to open */
+ VDEV_STATE_REMOVED, /* Explicitly removed from system */
VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */
+ VDEV_STATE_FAULTED, /* External request to fault device */
VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */
VDEV_STATE_HEALTHY /* Presumed good */
} vdev_state_t;
--- /sys/boot/zfs/zfsimpl.c.ORIG 2009-11-21 07:02:35.000000000 -0800
+++ /sys/boot/zfs/zfsimpl.c 2009-12-07 14:36:20.000000000 -0800
@@ -404,7 +404,7 @@
}
static int
-vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp)
+vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp, int is_newer)
{
int rc;
uint64_t guid, id, ashift, nparity;
@@ -412,7 +412,8 @@
const char *path;
vdev_t *vdev, *kid;
const unsigned char *kids;
- int nkids, i;
+ int nkids, i, is_new;
+ uint64_t is_offline, is_faulted, is_degraded, is_removed;
if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
DATA_TYPE_UINT64, 0, &guid)
@@ -424,17 +425,6 @@
return (ENOENT);
}
- /*
- * Assume that if we've seen this vdev tree before, this one
- * will be identical.
- */
- vdev = vdev_find(guid);
- if (vdev) {
- if (vdevp)
- *vdevp = vdev;
- return (0);
- }
-
if (strcmp(type, VDEV_TYPE_MIRROR)
&& strcmp(type, VDEV_TYPE_DISK)
&& strcmp(type, VDEV_TYPE_RAIDZ)) {
@@ -442,44 +432,95 @@
return (EIO);
}
- if (!strcmp(type, VDEV_TYPE_MIRROR))
- vdev = vdev_create(guid, vdev_mirror_read);
- else if (!strcmp(type, VDEV_TYPE_RAIDZ))
- vdev = vdev_create(guid, vdev_raidz_read);
- else
- vdev = vdev_create(guid, vdev_disk_read);
+ is_offline = is_removed = is_faulted = is_degraded = 0;
+
+ nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
+ &is_offline);
+ nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
+ &is_removed);
+ nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
+ &is_faulted);
+ nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
+ &is_degraded);
+
+ vdev = vdev_find(guid);
+ if (!vdev) {
+
+ is_new = 1;
+
+ if (!strcmp(type, VDEV_TYPE_MIRROR))
+ vdev = vdev_create(guid, vdev_mirror_read);
+ else if (!strcmp(type, VDEV_TYPE_RAIDZ))
+ vdev = vdev_create(guid, vdev_raidz_read);
+ else
+ vdev = vdev_create(guid, vdev_disk_read);
+
+ vdev->v_id = id;
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
+ DATA_TYPE_UINT64, 0, &ashift) == 0)
+ vdev->v_ashift = ashift;
+ else
+ vdev->v_ashift = 0;
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
+ DATA_TYPE_UINT64, 0, &nparity) == 0)
+ vdev->v_nparity = nparity;
+ else
+ vdev->v_nparity = 0;
+ if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
+ DATA_TYPE_STRING, 0, &path) == 0) {
+ if (strlen(path) > 5
+ && path[0] == '/'
+ && path[1] == 'd'
+ && path[2] == 'e'
+ && path[3] == 'v'
+ && path[4] == '/')
+ path += 5;
+ vdev->v_name = strdup(path);
+ } else {
+ if (!strcmp(type, "raidz")) {
+ if (vdev->v_nparity == 1)
+ vdev->v_name = "raidz1";
+ else
+ vdev->v_name = "raidz2";
+ } else {
+ vdev->v_name = strdup(type);
+ }
+ }
+
+ if (is_offline)
+ vdev->v_state = VDEV_STATE_OFFLINE;
+ else if (is_removed)
+ vdev->v_state = VDEV_STATE_REMOVED;
+ else if (is_faulted)
+ vdev->v_state = VDEV_STATE_FAULTED;
+ else if (is_degraded)
+ vdev->v_state = VDEV_STATE_DEGRADED;
+ else
+ vdev->v_state = VDEV_STATE_HEALTHY;
- vdev->v_id = id;
- if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
- DATA_TYPE_UINT64, 0, &ashift) == 0)
- vdev->v_ashift = ashift;
- else
- vdev->v_ashift = 0;
- if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
- DATA_TYPE_UINT64, 0, &nparity) == 0)
- vdev->v_nparity = nparity;
- else
- vdev->v_nparity = 0;
- if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
- DATA_TYPE_STRING, 0, &path) == 0) {
- if (strlen(path) > 5
- && path[0] == '/'
- && path[1] == 'd'
- && path[2] == 'e'
- && path[3] == 'v'
- && path[4] == '/')
- path += 5;
- vdev->v_name = strdup(path);
} else {
- if (!strcmp(type, "raidz")) {
- if (vdev->v_nparity == 1)
- vdev->v_name = "raidz1";
+
+ is_new = 0;
+
+ if (is_newer) {
+
+ /* We've already seen this vdev, but from an older
+ * vdev label, so let's refresh its state from the
+ * newer label. */
+
+ if (is_offline)
+ vdev->v_state = VDEV_STATE_OFFLINE;
+ else if (is_removed)
+ vdev->v_state = VDEV_STATE_REMOVED;
+ else if (is_faulted)
+ vdev->v_state = VDEV_STATE_FAULTED;
+ else if (is_degraded)
+ vdev->v_state = VDEV_STATE_DEGRADED;
else
- vdev->v_name = "raidz2";
- } else {
- vdev->v_name = strdup(type);
+ vdev->v_state = VDEV_STATE_HEALTHY;
}
}
+
rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
/*
@@ -488,10 +529,12 @@
if (rc == 0) {
vdev->v_nchildren = nkids;
for (i = 0; i < nkids; i++) {
- rc = vdev_init_from_nvlist(kids, &kid);
+ rc = vdev_init_from_nvlist(kids, &kid, is_newer);
if (rc)
return (rc);
- STAILQ_INSERT_TAIL(&vdev->v_children, kid, v_childlink);
+ if (is_new)
+ STAILQ_INSERT_TAIL(&vdev->v_children, kid,
+ v_childlink);
kids = nvlist_next(kids);
}
} else {
@@ -593,7 +636,9 @@
"UNKNOWN",
"CLOSED",
"OFFLINE",
+ "REMOVED",
"CANT_OPEN",
+ "FAULTED",
"DEGRADED",
"ONLINE"
};
@@ -711,7 +756,7 @@
uint64_t pool_txg, pool_guid;
const char *pool_name;
const unsigned char *vdevs;
- int i, rc;
+ int i, rc, is_newer;
char upbuf[1024];
const struct uberblock *up;
@@ -793,12 +838,15 @@
spa = spa_create(pool_guid);
spa->spa_name = strdup(pool_name);
}
- if (pool_txg > spa->spa_txg)
+ if (pool_txg > spa->spa_txg) {
spa->spa_txg = pool_txg;
+ is_newer = 1;
+ } else
+ is_newer = 0;
/*
* Get the vdev tree and create our in-core copy of it.
- * If we already have a healthy vdev with this guid, this must
+ * If we already have a vdev with this guid, this must
* be some kind of alias (overlapping slices, dangerously dedicated
* disks etc).
*/
@@ -808,16 +856,16 @@
return (EIO);
}
vdev = vdev_find(guid);
- if (vdev && vdev->v_state == VDEV_STATE_HEALTHY) {
+ if (vdev && vdev->v_phys_read) /* Has this vdev already been inited? */
return (EIO);
- }
if (nvlist_find(nvlist,
ZPOOL_CONFIG_VDEV_TREE,
DATA_TYPE_NVLIST, 0, &vdevs)) {
return (EIO);
}
- rc = vdev_init_from_nvlist(vdevs, &top_vdev);
+
+ rc = vdev_init_from_nvlist(vdevs, &top_vdev, is_newer);
if (rc)
return (rc);
@@ -838,7 +886,6 @@
if (vdev) {
vdev->v_phys_read = read;
vdev->v_read_priv = read_priv;
- vdev->v_state = VDEV_STATE_HEALTHY;
} else {
printf("ZFS: inconsistent nvlist contents\n");
return (EIO);
[-- Attachment #3 --]
--- zfs/zfsimpl.c.ORIG 2009-11-21 15:02:35.000000000 +0000
+++ zfs/zfsimpl.c 2009-12-11 12:26:38.745954297 +0000
@@ -51,7 +51,7 @@
static char *zap_scratch;
static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
-#define TEMP_SIZE (1*SPA_MAXBLOCKSIZE)
+#define TEMP_SIZE (1024 * 1024)
static int zio_read(spa_t *spa, const blkptr_t *bp, void *buf);
@@ -404,7 +404,7 @@
}
static int
-vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp)
+vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t **vdevp, int is_newer)
{
int rc;
uint64_t guid, id, ashift, nparity;
@@ -412,7 +412,8 @@
const char *path;
vdev_t *vdev, *kid;
const unsigned char *kids;
- int nkids, i;
+ int nkids, i, is_new;
+ uint64_t is_offline, is_faulted, is_degraded, is_removed;
if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID,
DATA_TYPE_UINT64, 0, &guid)
@@ -424,17 +425,6 @@
return (ENOENT);
}
- /*
- * Assume that if we've seen this vdev tree before, this one
- * will be identical.
- */
- vdev = vdev_find(guid);
- if (vdev) {
- if (vdevp)
- *vdevp = vdev;
- return (0);
- }
-
if (strcmp(type, VDEV_TYPE_MIRROR)
&& strcmp(type, VDEV_TYPE_DISK)
&& strcmp(type, VDEV_TYPE_RAIDZ)) {
@@ -442,6 +432,22 @@
return (EIO);
}
+ is_offline = is_removed = is_faulted = is_degraded = 0;
+
+ nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, 0,
+ &is_offline);
+ nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, 0,
+ &is_removed);
+ nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, 0,
+ &is_faulted);
+ nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, 0,
+ &is_degraded);
+
+ vdev = vdev_find(guid);
+ if (!vdev) {
+
+ is_new = 1;
+
if (!strcmp(type, VDEV_TYPE_MIRROR))
vdev = vdev_create(guid, vdev_mirror_read);
else if (!strcmp(type, VDEV_TYPE_RAIDZ))
@@ -480,6 +486,41 @@
vdev->v_name = strdup(type);
}
}
+
+ if (is_offline)
+ vdev->v_state = VDEV_STATE_OFFLINE;
+ else if (is_removed)
+ vdev->v_state = VDEV_STATE_REMOVED;
+ else if (is_faulted)
+ vdev->v_state = VDEV_STATE_FAULTED;
+ else if (is_degraded)
+ vdev->v_state = VDEV_STATE_DEGRADED;
+ else
+ vdev->v_state = VDEV_STATE_HEALTHY;
+
+ } else {
+
+ is_new = 0;
+
+ if (is_newer) {
+
+ /* We've already seen this vdev, but from an older
+ * vdev label, so let's refresh its state from the
+ * newer label. */
+
+ if (is_offline)
+ vdev->v_state = VDEV_STATE_OFFLINE;
+ else if (is_removed)
+ vdev->v_state = VDEV_STATE_REMOVED;
+ else if (is_faulted)
+ vdev->v_state = VDEV_STATE_FAULTED;
+ else if (is_degraded)
+ vdev->v_state = VDEV_STATE_DEGRADED;
+ else
+ vdev->v_state = VDEV_STATE_HEALTHY;
+ }
+ }
+
rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN,
DATA_TYPE_NVLIST_ARRAY, &nkids, &kids);
/*
@@ -488,10 +529,12 @@
if (rc == 0) {
vdev->v_nchildren = nkids;
for (i = 0; i < nkids; i++) {
- rc = vdev_init_from_nvlist(kids, &kid);
+ rc = vdev_init_from_nvlist(kids, &kid, is_newer);
if (rc)
return (rc);
- STAILQ_INSERT_TAIL(&vdev->v_children, kid, v_childlink);
+ if (is_new)
+ STAILQ_INSERT_TAIL(&vdev->v_children, kid,
+ v_childlink);
kids = nvlist_next(kids);
}
} else {
@@ -593,7 +636,9 @@
"UNKNOWN",
"CLOSED",
"OFFLINE",
+ "REMOVED",
"CANT_OPEN",
+ "FAULTED",
"DEGRADED",
"ONLINE"
};
@@ -711,7 +756,7 @@
uint64_t pool_txg, pool_guid;
const char *pool_name;
const unsigned char *vdevs;
- int i, rc;
+ int i, rc, is_newer;
char upbuf[1024];
const struct uberblock *up;
@@ -793,12 +838,15 @@
spa = spa_create(pool_guid);
spa->spa_name = strdup(pool_name);
}
- if (pool_txg > spa->spa_txg)
+ if (pool_txg > spa->spa_txg) {
spa->spa_txg = pool_txg;
+ is_newer = 1;
+ } else
+ is_newer = 0;
/*
* Get the vdev tree and create our in-core copy of it.
- * If we already have a healthy vdev with this guid, this must
+ * If we already have a vdev with this guid, this must
* be some kind of alias (overlapping slices, dangerously dedicated
* disks etc).
*/
@@ -808,16 +856,16 @@
return (EIO);
}
vdev = vdev_find(guid);
- if (vdev && vdev->v_state == VDEV_STATE_HEALTHY) {
+ if (vdev && vdev->v_phys_read) /* Has this vdev already been inited? */
return (EIO);
- }
if (nvlist_find(nvlist,
ZPOOL_CONFIG_VDEV_TREE,
DATA_TYPE_NVLIST, 0, &vdevs)) {
return (EIO);
}
- rc = vdev_init_from_nvlist(vdevs, &top_vdev);
+
+ rc = vdev_init_from_nvlist(vdevs, &top_vdev, is_newer);
if (rc)
return (rc);
@@ -838,7 +886,6 @@
if (vdev) {
vdev->v_phys_read = read;
vdev->v_read_priv = read_priv;
- vdev->v_state = VDEV_STATE_HEALTHY;
} else {
printf("ZFS: inconsistent nvlist contents\n");
return (EIO);
