From owner-svn-src-stable-12@freebsd.org Sun Dec 22 08:22:03 2019 Return-Path: Delivered-To: svn-src-stable-12@mailman.nyi.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2610:1c1:1:606c::19:1]) by mailman.nyi.freebsd.org (Postfix) with ESMTP id 1D3371CCCCE; Sun, 22 Dec 2019 08:22:03 +0000 (UTC) (envelope-from tsoome@FreeBSD.org) Received: from mxrelay.nyi.freebsd.org (mxrelay.nyi.freebsd.org [IPv6:2610:1c1:1:606c::19:3]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) server-signature RSA-PSS (4096 bits) client-signature RSA-PSS (4096 bits) client-digest SHA256) (Client CN "mxrelay.nyi.freebsd.org", Issuer "Let's Encrypt Authority X3" (verified OK)) by mx1.freebsd.org (Postfix) with ESMTPS id 47gb7H0k95z49fg; Sun, 22 Dec 2019 08:22:03 +0000 (UTC) (envelope-from tsoome@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2610:1c1:1:6068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mxrelay.nyi.freebsd.org (Postfix) with ESMTPS id 13CA424BF7; Sun, 22 Dec 2019 08:22:03 +0000 (UTC) (envelope-from tsoome@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.37]) by repo.freebsd.org (8.15.2/8.15.2) with ESMTP id xBM8M3X1035916; Sun, 22 Dec 2019 08:22:03 GMT (envelope-from tsoome@FreeBSD.org) Received: (from tsoome@localhost) by repo.freebsd.org (8.15.2/8.15.2/Submit) id xBM8M3vH035915; Sun, 22 Dec 2019 08:22:03 GMT (envelope-from tsoome@FreeBSD.org) Message-Id: <201912220822.xBM8M3vH035915@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: tsoome set sender to tsoome@FreeBSD.org using -f From: Toomas Soome Date: Sun, 22 Dec 2019 08:22:03 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-12@freebsd.org Subject: svn commit: r356003 - in stable/12: stand/libsa/zfs sys/cddl/boot/zfs X-SVN-Group: stable-12 X-SVN-Commit-Author: tsoome X-SVN-Commit-Paths: in stable/12: 
stand/libsa/zfs sys/cddl/boot/zfs X-SVN-Commit-Revision: 356003 X-SVN-Commit-Repository: base MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-stable-12@freebsd.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: SVN commit messages for only the 12-stable src tree List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sun, 22 Dec 2019 08:22:03 -0000 Author: tsoome Date: Sun Dec 22 08:22:02 2019 New Revision: 356003 URL: https://svnweb.freebsd.org/changeset/base/356003 Log: MFC r354283, r354323, r354363, r354364, r354593, r355773, r355786: loader: we do not support booting from pool with log device loader: factor out label and uberblock load from vdev_probe, add MMP checks loader: populate nvl with data even when label_txg is 0 loader: clean up the noise around log device loader: memory leak in vdev_label_read_config() loader: zfsimpl.c cstyle cleanup loader: rewrite zfs vdev initialization In some cases the pool discovery will get stuck in an infinite loop while setting up the vdev children. To fix this, we split the vdev setup into two parts: first we create vdevs based on the configuration we get from the pool label; then we process the pool config from the MOS and update the pool config if needed. This patch bundle is the work leading up to, and including, the fix for the issue where in some cases building the pool configuration ends up in an infinite loop. 
PR: 241118 Reported by: Ryan Moeller Modified: stable/12/stand/libsa/zfs/zfsimpl.c stable/12/sys/cddl/boot/zfs/zfsimpl.h stable/12/sys/cddl/boot/zfs/zfssubr.c Directory Properties: stable/12/ (props changed) Modified: stable/12/stand/libsa/zfs/zfsimpl.c ============================================================================== --- stable/12/stand/libsa/zfs/zfsimpl.c Sun Dec 22 06:56:44 2019 (r356002) +++ stable/12/stand/libsa/zfs/zfsimpl.c Sun Dec 22 08:22:02 2019 (r356003) @@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include "zfsimpl.h" #include "zfssubr.c" @@ -246,7 +247,7 @@ nvlist_find(const unsigned char *nvlist, const char *n const char *pairname; xdr_int(&p, &namelen); - pairname = (const char*) p; + pairname = (const char *)p; p += roundup(namelen, 4); xdr_int(&p, &pairtype); @@ -260,12 +261,12 @@ nvlist_find(const unsigned char *nvlist, const char *n } else if (type == DATA_TYPE_STRING) { int len; xdr_int(&p, &len); - (*(const char**) valuep) = (const char*) p; + (*(const char **)valuep) = (const char *)p; return (0); - } else if (type == DATA_TYPE_NVLIST - || type == DATA_TYPE_NVLIST_ARRAY) { - (*(const unsigned char**) valuep) = - (const unsigned char*) p; + } else if (type == DATA_TYPE_NVLIST || + type == DATA_TYPE_NVLIST_ARRAY) { + (*(const unsigned char **)valuep) = + (const unsigned char *)p; return (0); } else { return (EIO); @@ -310,7 +311,7 @@ nvlist_check_features_for_read(const unsigned char *nv found = 0; xdr_int(&p, &namelen); - pairname = (const char*) p; + pairname = (const char *)p; p += roundup(namelen, 4); xdr_int(&p, &pairtype); @@ -416,7 +417,7 @@ nvlist_print(const unsigned char *nvlist, unsigned int const char *pairname; xdr_int(&p, &namelen); - pairname = (const char*) p; + pairname = (const char *)p; p += roundup(namelen, 4); xdr_int(&p, &pairtype); @@ -491,12 +492,12 @@ vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void /*printf("ZFS: reading %zu bytes at 0x%jx to %p\n", psize, 
(uintmax_t)offset, buf);*/ rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize); - if (rc) - return (rc); - if (bp != NULL) - return (zio_checksum_verify(vdev->spa, bp, buf)); + if (rc == 0) { + if (bp != NULL) + rc = zio_checksum_verify(vdev->v_spa, bp, buf); + } - return (0); + return (rc); } typedef struct remap_segment { @@ -564,6 +565,7 @@ vdev_indirect_mapping_open(spa_t *spa, objset_phys_t * vim->vim_havecounts = (vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0); + return (vim); } @@ -774,8 +776,10 @@ static vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd; + vdev_list_t *vlist; - STAILQ_FOREACH(rvd, &spa->spa_vdevs, v_childlink) + vlist = &spa->spa_root_vdev->v_children; + STAILQ_FOREACH(rvd, vlist, v_childlink) if (rvd->v_id == vdev) break; @@ -838,7 +842,7 @@ static void vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg) { list_t stack; - spa_t *spa = vd->spa; + spa_t *spa = vd->v_spa; zio_t *zio = arg; remap_segment_t *rs; @@ -896,7 +900,6 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint6 */ if (zio->io_error != 0) break; - rs->rs_offset += inner_size; rs->rs_asize -= inner_size; rs->rs_split_offset += inner_size; @@ -932,19 +935,20 @@ static int vdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset, size_t bytes) { - zio_t zio = { 0 }; - spa_t *spa = vdev->spa; - indirect_vsd_t *iv = malloc(sizeof (*iv)); + zio_t zio; + spa_t *spa = vdev->v_spa; + indirect_vsd_t *iv; indirect_split_t *first; int rc = EIO; + iv = calloc(1, sizeof(*iv)); if (iv == NULL) return (ENOMEM); - bzero(iv, sizeof (*iv)); list_create(&iv->iv_splits, sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); + bzero(&zio, sizeof(zio)); zio.io_spa = spa; zio.io_bp = (blkptr_t *)bp; zio.io_data = buf; @@ -1083,39 +1087,72 @@ vdev_create(uint64_t guid, vdev_read_t *_read) vdev_t *vdev; vdev_indirect_config_t *vic; - vdev = malloc(sizeof(vdev_t)); - memset(vdev, 0, sizeof(vdev_t)); - 
STAILQ_INIT(&vdev->v_children); - vdev->v_guid = guid; - vdev->v_state = VDEV_STATE_OFFLINE; - vdev->v_read = _read; + vdev = calloc(1, sizeof(vdev_t)); + if (vdev != NULL) { + STAILQ_INIT(&vdev->v_children); + vdev->v_guid = guid; + vdev->v_read = _read; - vic = &vdev->vdev_indirect_config; - vic->vic_prev_indirect_vdev = UINT64_MAX; - STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink); + /* + * root vdev has no read function. + * We only point root vdev from spa. + */ + if (_read != NULL) { + vic = &vdev->vdev_indirect_config; + vic->vic_prev_indirect_vdev = UINT64_MAX; + STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink); + } + } return (vdev); } +static void +vdev_set_initial_state(vdev_t *vdev, const unsigned char *nvlist) +{ + uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present; + uint64_t is_log; + + is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0; + is_log = 0; + (void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL, + &is_offline); + (void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL, + &is_removed); + (void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL, + &is_faulted); + (void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, + NULL, &is_degraded); + (void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, + NULL, &isnt_present); + (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL, + &is_log); + + if (is_offline != 0) + vdev->v_state = VDEV_STATE_OFFLINE; + else if (is_removed != 0) + vdev->v_state = VDEV_STATE_REMOVED; + else if (is_faulted != 0) + vdev->v_state = VDEV_STATE_FAULTED; + else if (is_degraded != 0) + vdev->v_state = VDEV_STATE_DEGRADED; + else if (isnt_present != 0) + vdev->v_state = VDEV_STATE_CANT_OPEN; + + vdev->v_islog = is_log == 1; +} + static int -vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev, - vdev_t **vdevp, int is_newer) +vdev_init(uint64_t guid, const unsigned char 
*nvlist, vdev_t **vdevp) { - int rc; - uint64_t guid, id, ashift, asize, nparity; - const char *type; + uint64_t id, ashift, asize, nparity; const char *path; - vdev_t *vdev, *kid; - const unsigned char *kids; - int nkids, i, is_new; - uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present; + const char *type; + vdev_t *vdev; - if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, - NULL, &guid) - || nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id) - || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, + if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id) || + nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, NULL, &type)) { - printf("ZFS: can't find vdev details\n"); return (ENOENT); } @@ -1131,139 +1168,224 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vde return (EIO); } - is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0; + if (strcmp(type, VDEV_TYPE_MIRROR) == 0) + vdev = vdev_create(guid, vdev_mirror_read); + else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) + vdev = vdev_create(guid, vdev_raidz_read); + else if (strcmp(type, VDEV_TYPE_REPLACING) == 0) + vdev = vdev_create(guid, vdev_replacing_read); + else if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) { + vdev_indirect_config_t *vic; - nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL, - &is_offline); - nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL, - &is_removed); - nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL, - &is_faulted); - nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, NULL, - &is_degraded); - nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, NULL, - &isnt_present); - - vdev = vdev_find(guid); - if (!vdev) { - is_new = 1; - - if (!strcmp(type, VDEV_TYPE_MIRROR)) - vdev = vdev_create(guid, vdev_mirror_read); - else if (!strcmp(type, VDEV_TYPE_RAIDZ)) - vdev = vdev_create(guid, vdev_raidz_read); - else if (!strcmp(type, 
VDEV_TYPE_REPLACING)) - vdev = vdev_create(guid, vdev_replacing_read); - else if (!strcmp(type, VDEV_TYPE_INDIRECT)) { - vdev_indirect_config_t *vic; - - vdev = vdev_create(guid, vdev_indirect_read); + vdev = vdev_create(guid, vdev_indirect_read); + if (vdev != NULL) { vdev->v_state = VDEV_STATE_HEALTHY; vic = &vdev->vdev_indirect_config; nvlist_find(nvlist, - ZPOOL_CONFIG_INDIRECT_OBJECT, DATA_TYPE_UINT64, + ZPOOL_CONFIG_INDIRECT_OBJECT, + DATA_TYPE_UINT64, NULL, &vic->vic_mapping_object); nvlist_find(nvlist, - ZPOOL_CONFIG_INDIRECT_BIRTHS, DATA_TYPE_UINT64, + ZPOOL_CONFIG_INDIRECT_BIRTHS, + DATA_TYPE_UINT64, NULL, &vic->vic_births_object); nvlist_find(nvlist, - ZPOOL_CONFIG_PREV_INDIRECT_VDEV, DATA_TYPE_UINT64, + ZPOOL_CONFIG_PREV_INDIRECT_VDEV, + DATA_TYPE_UINT64, NULL, &vic->vic_prev_indirect_vdev); - } else - vdev = vdev_create(guid, vdev_disk_read); - - vdev->v_id = id; - vdev->v_top = pvdev != NULL ? pvdev : vdev; - if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT, - DATA_TYPE_UINT64, NULL, &ashift) == 0) { - vdev->v_ashift = ashift; - } else { - vdev->v_ashift = 0; } - if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE, - DATA_TYPE_UINT64, NULL, &asize) == 0) { - vdev->v_psize = asize + - VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; - } - if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY, - DATA_TYPE_UINT64, NULL, &nparity) == 0) { - vdev->v_nparity = nparity; - } else { - vdev->v_nparity = 0; - } - if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH, - DATA_TYPE_STRING, NULL, &path) == 0) { - if (strncmp(path, "/dev/", 5) == 0) - path += 5; - vdev->v_name = strdup(path); - } else { - char *name; + } else { + vdev = vdev_create(guid, vdev_disk_read); + } - if (!strcmp(type, "raidz")) { - if (vdev->v_nparity < 1 || - vdev->v_nparity > 3) { - printf("ZFS: can only boot from disk, " - "mirror, raidz1, raidz2 and raidz3 " - "vdevs\n"); - return (EIO); - } - asprintf(&name, "%s%d-%jd", type, - vdev->v_nparity, id); - } else { - asprintf(&name, "%s-%jd", type, id); + if (vdev == NULL) + 
return (ENOMEM); + + vdev_set_initial_state(vdev, nvlist); + vdev->v_id = id; + if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT, + DATA_TYPE_UINT64, NULL, &ashift) == 0) + vdev->v_ashift = ashift; + + if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE, + DATA_TYPE_UINT64, NULL, &asize) == 0) { + vdev->v_psize = asize + + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + } + + if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY, + DATA_TYPE_UINT64, NULL, &nparity) == 0) + vdev->v_nparity = nparity; + + if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH, + DATA_TYPE_STRING, NULL, &path) == 0) { + if (strncmp(path, "/dev/", 5) == 0) + path += 5; + vdev->v_name = strdup(path); + } else { + char *name; + + name = NULL; + if (strcmp(type, "raidz") == 0) { + if (vdev->v_nparity < 1 || + vdev->v_nparity > 3) { + printf("ZFS: can only boot from disk, " + "mirror, raidz1, raidz2 and raidz3 " + "vdevs\n"); + return (EIO); } - if (name == NULL) - return (ENOMEM); - vdev->v_name = name; + (void) asprintf(&name, "%s%d-%" PRIu64, type, + vdev->v_nparity, id); + } else { + (void) asprintf(&name, "%s-%" PRIu64, type, id); } - } else { - is_new = 0; + vdev->v_name = name; } + *vdevp = vdev; + return (0); +} - if (is_new || is_newer) { - /* - * This is either new vdev or we've already seen this vdev, - * but from an older vdev label, so let's refresh its state - * from the newer label. - */ - if (is_offline) - vdev->v_state = VDEV_STATE_OFFLINE; - else if (is_removed) - vdev->v_state = VDEV_STATE_REMOVED; - else if (is_faulted) - vdev->v_state = VDEV_STATE_FAULTED; - else if (is_degraded) - vdev->v_state = VDEV_STATE_DEGRADED; - else if (isnt_present) - vdev->v_state = VDEV_STATE_CANT_OPEN; +/* + * Find slot for vdev. We return either NULL to signal to use + * STAILQ_INSERT_HEAD, or we return link element to be used with + * STAILQ_INSERT_AFTER. 
+ */ +static vdev_t * +vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev) +{ + vdev_t *v, *previous; + + if (STAILQ_EMPTY(&top_vdev->v_children)) + return (NULL); + + previous = NULL; + STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) { + if (v->v_id > vdev->v_id) + return (previous); + + if (v->v_id == vdev->v_id) + return (v); + + if (v->v_id < vdev->v_id) + previous = v; } + return (previous); +} - rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY, - &nkids, &kids); +static size_t +vdev_child_count(vdev_t *vdev) +{ + vdev_t *v; + size_t count; + + count = 0; + STAILQ_FOREACH(v, &vdev->v_children, v_childlink) { + count++; + } + return (count); +} + +/* + * Insert vdev into top_vdev children list. List is ordered by v_id. + */ +static void +vdev_insert(vdev_t *top_vdev, vdev_t *vdev) +{ + vdev_t *previous; + size_t count; + /* - * Its ok if we don't have any kids. + * The top level vdev can appear in random order, depending how + * the firmware is presenting the disk devices. + * However, we will insert vdev to create list ordered by v_id, + * so we can use either STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER + * as STAILQ does not have insert before. */ + previous = vdev_find_previous(top_vdev, vdev); + + if (previous == NULL) { + STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink); + count = vdev_child_count(top_vdev); + if (top_vdev->v_nchildren < count) + top_vdev->v_nchildren = count; + return; + } + + if (previous->v_id == vdev->v_id) + return; + + STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev, v_childlink); + count = vdev_child_count(top_vdev); + if (top_vdev->v_nchildren < count) + top_vdev->v_nchildren = count; +} + +static int +vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const unsigned char *nvlist) +{ + vdev_t *top_vdev, *vdev; + const unsigned char *kids; + int rc, nkids; + + /* Get top vdev. 
*/ + top_vdev = vdev_find(top_guid); + if (top_vdev == NULL) { + rc = vdev_init(top_guid, nvlist, &top_vdev); + if (rc != 0) + return (rc); + top_vdev->v_spa = spa; + top_vdev->v_top = top_vdev; + vdev_insert(spa->spa_root_vdev, top_vdev); + } + + /* Add children if there are any. */ + rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY, + &nkids, &kids); if (rc == 0) { - vdev->v_nchildren = nkids; - for (i = 0; i < nkids; i++) { - rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer); - if (rc) + for (int i = 0; i < nkids; i++) { + uint64_t guid; + + rc = nvlist_find(kids, ZPOOL_CONFIG_GUID, + DATA_TYPE_UINT64, NULL, &guid); + if (rc != 0) return (rc); - if (is_new) - STAILQ_INSERT_TAIL(&vdev->v_children, kid, - v_childlink); + rc = vdev_init(guid, kids, &vdev); + if (rc != 0) + return (rc); + + vdev->v_spa = spa; + vdev->v_top = top_vdev; + vdev_insert(top_vdev, vdev); + kids = nvlist_next(kids); } } else { - vdev->v_nchildren = 0; + rc = 0; } - if (vdevp) - *vdevp = vdev; - return (0); + return (rc); } +static int +vdev_init_from_label(spa_t *spa, const unsigned char *nvlist) +{ + uint64_t pool_guid, top_guid; + const unsigned char *vdevs; + + if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, + NULL, &pool_guid) || + nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64, + NULL, &top_guid) || + nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, + NULL, &vdevs)) { + printf("ZFS: can't find vdev details\n"); + return (ENOENT); + } + + return (vdev_from_nvlist(spa, top_guid, vdevs)); +} + static void vdev_set_state(vdev_t *vdev) { @@ -1271,6 +1393,10 @@ vdev_set_state(vdev_t *vdev) int good_kids; int bad_kids; + STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { + vdev_set_state(kid); + } + /* * A mirror or raidz is healthy if all its kids are healthy. 
A * mirror is degraded if any of its kids is healthy; a raidz @@ -1305,6 +1431,104 @@ vdev_set_state(vdev_t *vdev) } } +static int +vdev_update_from_nvlist(uint64_t top_guid, const unsigned char *nvlist) +{ + vdev_t *vdev; + const unsigned char *kids; + int rc, nkids; + + /* Update top vdev. */ + vdev = vdev_find(top_guid); + if (vdev != NULL) + vdev_set_initial_state(vdev, nvlist); + + /* Update children if there are any. */ + rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY, + &nkids, &kids); + if (rc == 0) { + for (int i = 0; i < nkids; i++) { + uint64_t guid; + + rc = nvlist_find(kids, ZPOOL_CONFIG_GUID, + DATA_TYPE_UINT64, NULL, &guid); + if (rc != 0) + break; + + vdev = vdev_find(guid); + if (vdev != NULL) + vdev_set_initial_state(vdev, kids); + + kids = nvlist_next(kids); + } + } else { + rc = 0; + } + + return (rc); +} + +static int +vdev_init_from_nvlist(spa_t *spa, const unsigned char *nvlist) +{ + uint64_t pool_guid, vdev_children; + const unsigned char *vdevs, *kids; + int rc, nkids; + + if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, + NULL, &pool_guid) || + nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64, + NULL, &vdev_children) || + nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, + NULL, &vdevs)) { + printf("ZFS: can't find vdev details\n"); + return (ENOENT); + } + + /* Wrong guid?! */ + if (spa->spa_guid != pool_guid) + return (EIO); + + spa->spa_root_vdev->v_nchildren = vdev_children; + + rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY, + &nkids, &kids); + + /* + * MOS config has at least one child for root vdev. + */ + if (rc != 0) + return (EIO); + + for (int i = 0; i < nkids; i++) { + uint64_t guid; + vdev_t *vdev; + + rc = nvlist_find(kids, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, + NULL, &guid); + if (rc != 0) + break; + vdev = vdev_find(guid); + /* + * Top level vdev is missing, create it. 
+ */ + if (vdev == NULL) + rc = vdev_from_nvlist(spa, guid, kids); + else + rc = vdev_update_from_nvlist(guid, kids); + if (rc != 0) + break; + kids = nvlist_next(kids); + } + + /* + * Re-evaluate top-level vdev state. + */ + vdev_set_state(spa->spa_root_vdev); + + return (rc); +} + static spa_t * spa_find_by_guid(uint64_t guid) { @@ -1347,7 +1571,7 @@ spa_get_primary_vdev(const spa_t *spa) spa = spa_get_primary(); if (spa == NULL) return (NULL); - vdev = STAILQ_FIRST(&spa->spa_vdevs); + vdev = spa->spa_root_vdev; if (vdev == NULL) return (NULL); for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL; @@ -1368,8 +1592,14 @@ spa_create(uint64_t guid, const char *name) free(spa); return (NULL); } - STAILQ_INIT(&spa->spa_vdevs); spa->spa_guid = guid; + spa->spa_root_vdev = vdev_create(guid, NULL); + if (spa->spa_root_vdev == NULL) { + free(spa->spa_name); + free(spa); + return (NULL); + } + spa->spa_root_vdev->v_name = strdup("root"); STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link); return (spa); @@ -1404,9 +1634,8 @@ pager_printf(const char *fmt, ...) va_list args; va_start(args, fmt); - vsprintf(line, fmt, args); + vsnprintf(line, sizeof(line), fmt, args); va_end(args); - return (pager_output(line)); } @@ -1417,14 +1646,13 @@ pager_printf(const char *fmt, ...) 
static int print_state(int indent, const char *name, vdev_state_t state) { - char buf[512]; int i; + char buf[512]; buf[0] = 0; for (i = 0; i < indent; i++) strcat(buf, " "); strcat(buf, name); - return (pager_printf(STATUS_FORMAT, buf, state_name(state))); } @@ -1433,6 +1661,12 @@ vdev_status(vdev_t *vdev, int indent) { vdev_t *kid; int ret; + + if (vdev->v_islog) { + (void)pager_output(" logs\n"); + indent++; + } + ret = print_state(indent, vdev->v_name, vdev->v_state); if (ret != 0) return (ret); @@ -1450,6 +1684,7 @@ spa_status(spa_t *spa) { static char bootfs[ZFS_MAXNAMELEN]; uint64_t rootid; + vdev_list_t *vlist; vdev_t *vdev; int good_kids, bad_kids, degraded_kids, ret; vdev_state_t state; @@ -1478,7 +1713,8 @@ spa_status(spa_t *spa) good_kids = 0; degraded_kids = 0; bad_kids = 0; - STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { + vlist = &spa->spa_root_vdev->v_children; + STAILQ_FOREACH(vdev, vlist, v_childlink) { if (vdev->v_state == VDEV_STATE_HEALTHY) good_kids++; else if (vdev->v_state == VDEV_STATE_DEGRADED) @@ -1496,7 +1732,8 @@ spa_status(spa_t *spa) ret = print_state(0, spa->spa_name, state); if (ret != 0) return (ret); - STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { + + STAILQ_FOREACH(vdev, vlist, v_childlink) { ret = vdev_status(vdev, 1); if (ret != 0) return (ret); @@ -1538,71 +1775,104 @@ vdev_label_offset(uint64_t psize, int l, uint64_t offs } static int -vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) +vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) { - vdev_t vtmp; - vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch; - vdev_phys_t *tmp_label; - spa_t *spa; - vdev_t *vdev, *top_vdev, *pool_vdev; - off_t off; + unsigned int seq1 = 0; + unsigned int seq2 = 0; + int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg); + + if (cmp != 0) + return (cmp); + + cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp); + if (cmp != 0) + return (cmp); + + if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1)) + seq1 = 
MMP_SEQ(ub1); + + if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2)) + seq2 = MMP_SEQ(ub2); + + return (AVL_CMP(seq1, seq2)); +} + +static int +uberblock_verify(uberblock_t *ub) +{ + if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) { + byteswap_uint64_array(ub, sizeof (uberblock_t)); + } + + if (ub->ub_magic != UBERBLOCK_MAGIC || + !SPA_VERSION_IS_SUPPORTED(ub->ub_version)) + return (EINVAL); + + return (0); +} + +static int +vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset, + size_t size) +{ blkptr_t bp; - const unsigned char *nvlist = NULL; - uint64_t val; - uint64_t guid; - uint64_t best_txg = 0; - uint64_t pool_txg, pool_guid; - const char *pool_name; - const unsigned char *vdevs; - const unsigned char *features; - int i, l, rc, is_newer; - char *upbuf; - const struct uberblock *up; + off_t off; - /* - * Load the vdev label and figure out which - * uberblock is most current. - */ - memset(&vtmp, 0, sizeof(vtmp)); - vtmp.v_phys_read = _read; - vtmp.v_read_priv = read_priv; - vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv), - (uint64_t)sizeof (vdev_label_t)); + off = vdev_label_offset(vd->v_psize, l, offset); - /* Test for minimum pool size. 
*/ - if (vtmp.v_psize < SPA_MINDEVSIZE) - return (EIO); + BP_ZERO(&bp); + BP_SET_LSIZE(&bp, size); + BP_SET_PSIZE(&bp, size); + BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); + BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); + DVA_SET_OFFSET(BP_IDENTITY(&bp), off); + ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); - tmp_label = zfs_alloc(sizeof(vdev_phys_t)); + return (vdev_read_phys(vd, &bp, buf, off, size)); +} - for (l = 0; l < VDEV_LABELS; l++) { - off = vdev_label_offset(vtmp.v_psize, l, - offsetof(vdev_label_t, vl_vdev_phys)); +static unsigned char * +vdev_label_read_config(vdev_t *vd, uint64_t txg) +{ + vdev_phys_t *label; + uint64_t best_txg = 0; + uint64_t label_txg = 0; + uint64_t asize; + unsigned char *nvl; + size_t nvl_size; + int error; - BP_ZERO(&bp); - BP_SET_LSIZE(&bp, sizeof(vdev_phys_t)); - BP_SET_PSIZE(&bp, sizeof(vdev_phys_t)); - BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); - BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); - DVA_SET_OFFSET(BP_IDENTITY(&bp), off); - ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); + label = malloc(sizeof (vdev_phys_t)); + if (label == NULL) + return (NULL); - if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0)) - continue; + nvl_size = VDEV_PHYS_SIZE - sizeof (zio_eck_t) - 4; + nvl = malloc(nvl_size); + if (nvl == NULL) + goto done; - if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR) + for (int l = 0; l < VDEV_LABELS; l++) { + const unsigned char *nvlist; + + if (vdev_label_read(vd, l, label, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t))) continue; - nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4; - if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, - DATA_TYPE_UINT64, NULL, &pool_txg) != 0) + if (label->vp_nvlist[0] != NV_ENCODE_XDR) continue; - if (best_txg <= pool_txg) { - uint64_t asize; + nvlist = (const unsigned char *) label->vp_nvlist + 4; + error = nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, + DATA_TYPE_UINT64, NULL, &label_txg); + if (error != 0 || label_txg == 0) { + memcpy(nvl, nvlist, nvl_size); + goto done; 
+ } - best_txg = pool_txg; - memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t)); + if (label_txg <= txg && label_txg > best_txg) { + best_txg = label_txg; + memcpy(nvl, nvlist, nvl_size); /* * Use asize from pool config. We need this @@ -1610,30 +1880,88 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s */ if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE, DATA_TYPE_UINT64, NULL, &asize) == 0) { - vtmp.v_psize = asize + + vd->v_psize = asize + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; } } } - zfs_free(tmp_label, sizeof (vdev_phys_t)); + if (best_txg == 0) { + free(nvl); + nvl = NULL; + } +done: + free(label); + return (nvl); +} - if (best_txg == 0) +static void +vdev_uberblock_load(vdev_t *vd, uberblock_t *ub) +{ + uberblock_t *buf; + + buf = malloc(VDEV_UBERBLOCK_SIZE(vd)); + if (buf == NULL) + return; + + for (int l = 0; l < VDEV_LABELS; l++) { + for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + if (vdev_label_read(vd, l, buf, + VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd))) + continue; + if (uberblock_verify(buf) != 0) + continue; + + if (vdev_uberblock_compare(buf, ub) > 0) + *ub = *buf; + } + } + free(buf); +} + +static int +vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) +{ + vdev_t vtmp; + spa_t *spa; + vdev_t *vdev; + unsigned char *nvlist; + uint64_t val; + uint64_t guid, vdev_children; + uint64_t pool_txg, pool_guid; + const char *pool_name; + const unsigned char *features; + int rc; + + /* + * Load the vdev label and figure out which + * uberblock is most current. + */ + memset(&vtmp, 0, sizeof(vtmp)); + vtmp.v_phys_read = _read; + vtmp.v_read_priv = read_priv; + vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv), + (uint64_t)sizeof (vdev_label_t)); + + /* Test for minimum device size. */ + if (vtmp.v_psize < SPA_MINDEVSIZE) return (EIO); *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***