Date: Sun, 3 Nov 2019 21:19:52 +0000 (UTC)
From: Toomas Soome <tsoome@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r354323 - in head: stand/libsa/zfs sys/cddl/boot/zfs
Message-ID: <201911032119.xA3LJqFb078312@repo.freebsd.org>
Author: tsoome
Date: Sun Nov  3 21:19:52 2019
New Revision: 354323
URL: https://svnweb.freebsd.org/changeset/base/354323

Log:
  loader: factor out label and uberblock load from vdev_probe, add MMP checks

  Clean up the label read.

Modified:
  head/stand/libsa/zfs/zfsimpl.c
  head/sys/cddl/boot/zfs/zfsimpl.h

Modified: head/stand/libsa/zfs/zfsimpl.c
==============================================================================
--- head/stand/libsa/zfs/zfsimpl.c      Sun Nov  3 21:17:50 2019        (r354322)
+++ head/stand/libsa/zfs/zfsimpl.c      Sun Nov  3 21:19:52 2019        (r354323)
@@ -1549,71 +1549,104 @@ vdev_label_offset(uint64_t psize, int l, uint64_t offs
 }
 
 static int
-vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
+vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2)
 {
-        vdev_t vtmp;
-        vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
-        vdev_phys_t *tmp_label;
-        spa_t *spa;
-        vdev_t *vdev, *top_vdev, *pool_vdev;
-        off_t off;
+        unsigned int seq1 = 0;
+        unsigned int seq2 = 0;
+        int cmp = AVL_CMP(ub1->ub_txg, ub2->ub_txg);
+
+        if (cmp != 0)
+                return (cmp);
+
+        cmp = AVL_CMP(ub1->ub_timestamp, ub2->ub_timestamp);
+        if (cmp != 0)
+                return (cmp);
+
+        if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1))
+                seq1 = MMP_SEQ(ub1);
+
+        if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2))
+                seq2 = MMP_SEQ(ub2);
+
+        return (AVL_CMP(seq1, seq2));
+}
+
+static int
+uberblock_verify(uberblock_t *ub)
+{
+        if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC)) {
+                byteswap_uint64_array(ub, sizeof (uberblock_t));
+        }
+
+        if (ub->ub_magic != UBERBLOCK_MAGIC ||
+            !SPA_VERSION_IS_SUPPORTED(ub->ub_version))
+                return (EINVAL);
+
+        return (0);
+}
+
+static int
+vdev_label_read(vdev_t *vd, int l, void *buf, uint64_t offset,
+    size_t size)
+{
         blkptr_t bp;
-        const unsigned char *nvlist = NULL;
-        uint64_t val;
-        uint64_t guid;
-        uint64_t best_txg = 0;
-        uint64_t pool_txg, pool_guid;
-        const char *pool_name;
-        const unsigned char *vdevs;
-        const unsigned char *features;
-        int i, l, rc, is_newer;
-        char *upbuf;
-        const struct uberblock *up;
+        off_t off;
 
-        /*
-         * Load the vdev label and figure out which
-         * uberblock is most current.
-         */
-        memset(&vtmp, 0, sizeof(vtmp));
-        vtmp.v_phys_read = _read;
-        vtmp.v_read_priv = read_priv;
-        vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv),
-            (uint64_t)sizeof (vdev_label_t));
+        off = vdev_label_offset(vd->v_psize, l, offset);
 
-        /* Test for minimum pool size. */
-        if (vtmp.v_psize < SPA_MINDEVSIZE)
-                return (EIO);
+        BP_ZERO(&bp);
+        BP_SET_LSIZE(&bp, size);
+        BP_SET_PSIZE(&bp, size);
+        BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
+        BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+        DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
+        ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
 
-        tmp_label = zfs_alloc(sizeof(vdev_phys_t));
+        return (vdev_read_phys(vd, &bp, buf, off, size));
+}
 
-        for (l = 0; l < VDEV_LABELS; l++) {
-                off = vdev_label_offset(vtmp.v_psize, l,
-                    offsetof(vdev_label_t, vl_vdev_phys));
+static unsigned char *
+vdev_label_read_config(vdev_t *vd, uint64_t txg)
+{
+        vdev_phys_t *label;
+        uint64_t best_txg = 0;
+        uint64_t label_txg = 0;
+        uint64_t asize;
+        unsigned char *nvl;
+        size_t nvl_size;
+        int error;
 
-                BP_ZERO(&bp);
-                BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
-                BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
-                BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
-                BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
-                DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
-                ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
+        label = malloc(sizeof (vdev_phys_t));
+        if (label == NULL)
+                return (NULL);
 
-                if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0))
-                        continue;
+        nvl_size = VDEV_PHYS_SIZE - sizeof (zio_eck_t) - 4;
+        nvl = malloc(nvl_size);
+        if (nvl == NULL) {
+                free(label);
+                return (NULL);
+        }
 
-                if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR)
+        for (int l = 0; l < VDEV_LABELS; l++) {
+                const unsigned char *nvlist;
+
+                if (vdev_label_read(vd, l, label,
+                    offsetof(vdev_label_t, vl_vdev_phys),
+                    sizeof (vdev_phys_t)))
                        continue;
 
-                nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4;
-                if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
-                    DATA_TYPE_UINT64, NULL, &pool_txg) != 0)
+                if (label->vp_nvlist[0] != NV_ENCODE_XDR)
                        continue;
 
-                if (best_txg <= pool_txg) {
-                        uint64_t asize;
+                nvlist = (const unsigned char *) label->vp_nvlist + 4;
+                error = nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
+                    DATA_TYPE_UINT64, NULL, &label_txg);
+                if (error != 0 || label_txg == 0)
+                        return (nvl);
 
-                        best_txg = pool_txg;
-                        memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t));
+                if (label_txg <= txg && label_txg > best_txg) {
+                        best_txg = label_txg;
+                        memcpy(nvl, nvlist, nvl_size);
 
                        /*
                         * Use asize from pool config. We need this
@@ -1621,30 +1654,87 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
                         */
                        if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE,
                            DATA_TYPE_UINT64, NULL, &asize) == 0) {
-                                vtmp.v_psize = asize +
+                                vd->v_psize = asize +
                                    VDEV_LABEL_START_SIZE +
                                    VDEV_LABEL_END_SIZE;
                        }
                }
        }
 
-        zfs_free(tmp_label, sizeof (vdev_phys_t));
+        if (best_txg == 0) {
+                free(nvl);
+                nvl = NULL;
+        }
+        return (nvl);
+}
 
-        if (best_txg == 0)
+static void
+vdev_uberblock_load(vdev_t *vd, uberblock_t *ub)
+{
+        uberblock_t *buf;
+
+        buf = malloc(VDEV_UBERBLOCK_SIZE(vd));
+        if (buf == NULL)
+                return;
+
+        for (int l = 0; l < VDEV_LABELS; l++) {
+                for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
+                        if (vdev_label_read(vd, l, buf,
+                            VDEV_UBERBLOCK_OFFSET(vd, n),
+                            VDEV_UBERBLOCK_SIZE(vd)))
+                                continue;
+                        if (uberblock_verify(buf) != 0)
+                                continue;
+
+                        if (vdev_uberblock_compare(buf, ub) > 0)
+                                *ub = *buf;
+                }
+        }
+        free(buf);
+}
+
+static int
+vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
+{
+        vdev_t vtmp;
+        spa_t *spa;
+        vdev_t *vdev, *top_vdev, *pool_vdev;
+        unsigned char *nvlist;
+        uint64_t val;
+        uint64_t guid;
+        uint64_t pool_txg, pool_guid;
+        const char *pool_name;
+        const unsigned char *vdevs;
+        const unsigned char *features;
+        int rc, is_newer;
+
+        /*
+         * Load the vdev label and figure out which
+         * uberblock is most current.
+         */
+        memset(&vtmp, 0, sizeof(vtmp));
+        vtmp.v_phys_read = _read;
+        vtmp.v_read_priv = read_priv;
+        vtmp.v_psize = P2ALIGN(ldi_get_size(read_priv),
+            (uint64_t)sizeof (vdev_label_t));
+
+        /* Test for minimum device size. */
+        if (vtmp.v_psize < SPA_MINDEVSIZE)
                return (EIO);
 
-        if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR)
+        nvlist = vdev_label_read_config(&vtmp, UINT64_MAX);
+        if (nvlist == NULL)
                return (EIO);
 
-        nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
-
        if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
            NULL, &val) != 0) {
+                free(nvlist);
                return (EIO);
        }
 
        if (!SPA_VERSION_IS_SUPPORTED(val)) {
                printf("ZFS: unsupported ZFS version %u (should be %u)\n",
                    (unsigned) val, (unsigned) SPA_VERSION);
+                free(nvlist);
                return (EIO);
        }
 
@@ -1652,16 +1742,19 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
        if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
            DATA_TYPE_NVLIST, NULL, &features) == 0 &&
            nvlist_check_features_for_read(features) != 0) {
+                free(nvlist);
                return (EIO);
        }
 
        if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
            NULL, &val) != 0) {
+                free(nvlist);
                return (EIO);
        }
 
        if (val == POOL_STATE_DESTROYED) {
                /* We don't boot only from destroyed pools. */
+                free(nvlist);
                return (EIO);
        }
 
@@ -1675,12 +1768,13 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
                 * Cache and spare devices end up here - just ignore
                 * them.
                 */
-                /*printf("ZFS: can't find pool details\n");*/
+                free(nvlist);
                return (EIO);
        }
 
        if (nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64,
            NULL, &val) == 0 && val != 0) {
+                free(nvlist);
                return (EIO);
        }
 
@@ -1690,8 +1784,10 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
        spa = spa_find_by_guid(pool_guid);
        if (spa == NULL) {
                spa = spa_create(pool_guid, pool_name);
-                if (spa == NULL)
+                if (spa == NULL) {
+                        free(nvlist);
                        return (ENOMEM);
+                }
        }
        if (pool_txg > spa->spa_txg) {
                spa->spa_txg = pool_txg;
@@ -1708,18 +1804,24 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
         */
        if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
            NULL, &guid) != 0) {
+                free(nvlist);
                return (EIO);
        }
        vdev = vdev_find(guid);
-        if (vdev && vdev->v_phys_read)  /* Has this vdev already been inited? */
+        /* Has this vdev already been inited? */
+        if (vdev && vdev->v_phys_read) {
+                free(nvlist);
                return (EIO);
+        }
 
        if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
            NULL, &vdevs)) {
+                free(nvlist);
                return (EIO);
        }
 
        rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
+        free(nvlist);
        if (rc != 0)
                return (rc);
 
@@ -1729,6 +1831,7 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
        STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
                if (top_vdev == pool_vdev)
                        break;
+
        if (!pool_vdev && top_vdev) {
                top_vdev->spa = spa;
                STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
@@ -1765,36 +1868,7 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, s
         * the best uberblock and then we can actually access
         * the contents of the pool.
         */
-        upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
-        up = (const struct uberblock *)upbuf;
-        for (l = 0; l < VDEV_LABELS; l++) {
-                for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdev); i++) {
-                        off = vdev_label_offset(vdev->v_psize, l,
-                            VDEV_UBERBLOCK_OFFSET(vdev, i));
-                        BP_ZERO(&bp);
-                        DVA_SET_OFFSET(&bp.blk_dva[0], off);
-                        BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
-                        BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
-                        BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
-                        BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
-                        ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
-
-                        if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
-                                continue;
-
-                        if (up->ub_magic != UBERBLOCK_MAGIC)
-                                continue;
-                        if (up->ub_txg < spa->spa_txg)
-                                continue;
-                        if (up->ub_txg > spa->spa_uberblock.ub_txg ||
-                            (up->ub_txg == spa->spa_uberblock.ub_txg &&
-                            up->ub_timestamp >
-                            spa->spa_uberblock.ub_timestamp)) {
-                                spa->spa_uberblock = *up;
-                        }
-                }
-        }
-        zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
+        vdev_uberblock_load(vdev, &spa->spa_uberblock);
 
        vdev->spa = spa;
        if (spap != NULL)

Modified: head/sys/cddl/boot/zfs/zfsimpl.h
==============================================================================
--- head/sys/cddl/boot/zfs/zfsimpl.h    Sun Nov  3 21:17:50 2019        (r354322)
+++ head/sys/cddl/boot/zfs/zfsimpl.h    Sun Nov  3 21:19:52 2019        (r354323)
@@ -63,6 +63,14 @@
 
 #define _NOTE(s)
 
+/*
+ * AVL comparator helpers
+ */
+#define AVL_ISIGN(a)    (((a) > 0) - ((a) < 0))
+#define AVL_CMP(a, b)   (((a) > (b)) - ((a) < (b)))
+#define AVL_PCMP(a, b)  \
+        (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))
+
 typedef enum { B_FALSE, B_TRUE } boolean_t;
 
 /* CRC64 table */
@@ -490,8 +498,16 @@ typedef struct zio_gbh {
 #define VDEV_PHYS_SIZE          (112 << 10)
 #define VDEV_UBERBLOCK_RING     (128 << 10)
 
+/*
+ * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock
+ * ring when MMP is enabled.
+ */
+#define MMP_BLOCKS_PER_LABEL    1
+
+/* The largest uberblock we support is 8k. */
+#define MAX_UBERBLOCK_SHIFT     (13)
 #define VDEV_UBERBLOCK_SHIFT(vd)        \
-        MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT)
+        MIN(MAX((vd)->v_top->v_ashift, UBERBLOCK_SHIFT), MAX_UBERBLOCK_SHIFT)
 #define VDEV_UBERBLOCK_COUNT(vd)        \
        (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
 #define VDEV_UBERBLOCK_OFFSET(vd, n)    \
@@ -841,14 +857,88 @@ typedef enum pool_state {
 
 #define UBERBLOCK_MAGIC         0x00bab10c      /* oo-ba-bloc!  */
 #define UBERBLOCK_SHIFT         10              /* up to 1K     */
 
-struct uberblock {
+#define MMP_MAGIC               0xa11cea11      /* all-see-all  */
+
+#define MMP_INTERVAL_VALID_BIT  0x01
+#define MMP_SEQ_VALID_BIT       0x02
+#define MMP_FAIL_INT_VALID_BIT  0x04
+
+#define MMP_VALID(ubp)          (ubp->ub_magic == UBERBLOCK_MAGIC && \
+                                    ubp->ub_mmp_magic == MMP_MAGIC)
+#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+                                    MMP_INTERVAL_VALID_BIT))
+#define MMP_SEQ_VALID(ubp)      (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+                                    MMP_SEQ_VALID_BIT))
+#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \
+                                    MMP_FAIL_INT_VALID_BIT))
+
+#define MMP_INTERVAL(ubp)       ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \
+                                    >> 8)
+#define MMP_SEQ(ubp)            ((ubp->ub_mmp_config & 0x0000FFFF00000000) \
+                                    >> 32)
+#define MMP_FAIL_INT(ubp)       ((ubp->ub_mmp_config & 0xFFFF000000000000) \
+                                    >> 48)
+
+typedef struct uberblock {
        uint64_t        ub_magic;       /* UBERBLOCK_MAGIC              */
        uint64_t        ub_version;     /* SPA_VERSION                  */
        uint64_t        ub_txg;         /* txg of last sync             */
        uint64_t        ub_guid_sum;    /* sum of all vdev guids        */
        uint64_t        ub_timestamp;   /* UTC time of last sync        */
        blkptr_t        ub_rootbp;      /* MOS objset_phys_t            */
-};
+        /* highest SPA_VERSION supported by software that wrote this txg */
+        uint64_t        ub_software_version;
+        /* Maybe missing in uberblocks we read, but always written */
+        uint64_t        ub_mmp_magic;
+        /*
+         * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off.
+         * Otherwise, nanosec since last MMP write.
+         */
+        uint64_t        ub_mmp_delay;
+
+        /*
+         * The ub_mmp_config contains the multihost write interval, multihost
+         * fail intervals, sequence number for sub-second granularity, and
+         * valid bit mask. This layout is as follows:
+         *
+         *   64      56      48      40      32      24      16      8       0
+         *   +-------+-------+-------+-------+-------+-------+-------+-------+
+         * 0 | Fail Intervals|      Seq      |   Write Interval (ms) | VALID |
+         *   +-------+-------+-------+-------+-------+-------+-------+-------+
+         *
+         * This allows a write_interval of (2^24/1000)s, over 4.5 hours
+         *
+         * VALID Bits:
+         * - 0x01 - Write Interval (ms)
+         * - 0x02 - Sequence number exists
+         * - 0x04 - Fail Intervals
+         * - 0xf8 - Reserved
+         */
+        uint64_t        ub_mmp_config;
+
+        /*
+         * ub_checkpoint_txg indicates two things about the current uberblock:
+         *
+         * 1] If it is not zero then this uberblock is a checkpoint. If it is
+         *    zero, then this uberblock is not a checkpoint.
+         *
+         * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is
+         *    the ub_txg that the uberblock had at the time we moved it to
+         *    the MOS config.
+         *
+         * The field is set when we checkpoint the uberblock and continues to
+         * hold that value even after we've rewound (unlike the ub_txg that
+         * is reset to a higher value).
+         *
+         * Besides checks used to determine whether we are reopening the
+         * pool from a checkpointed uberblock [see spa_ld_select_uberblock()],
+         * the value of the field is used to determine which ZIL blocks have
+         * been allocated according to the ms_sm when we are rewinding to a
+         * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then
+         * the ZIL block is not allocated [see uses of spa_min_claim_txg()].
+         */
+        uint64_t        ub_checkpoint_txg;
+} uberblock_t;
 
 /*
  * Flags.
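For readers following along: the ordering rule that the new vdev_uberblock_compare() implements (txg first, then timestamp, then the MMP sequence number as a tie-breaker) can be exercised outside the loader. The program below is a minimal standalone C sketch, not part of the commit: AVL_CMP and the MMP_SEQ bit position are copied from the patch, while the reduced ub_t struct, the mmp_valid flag (standing in for MMP_VALID()), and the main() harness are illustrative only.

/*
 * Standalone sketch (not loader code) of the uberblock ordering with
 * the MMP sequence tie-breaker.
 */
#include <stdint.h>
#include <stdio.h>

#define AVL_CMP(a, b)   (((a) > (b)) - ((a) < (b)))

#define MMP_SEQ_VALID_BIT       0x02
/* Bits 32..47 of ub_mmp_config hold the sequence number. */
#define MMP_SEQ(ub)     (((ub)->mmp_config & 0x0000FFFF00000000ULL) >> 32)

typedef struct {
        uint64_t txg;           /* ub_txg */
        uint64_t timestamp;     /* ub_timestamp */
        int mmp_valid;          /* stand-in for MMP_VALID() */
        uint64_t mmp_config;    /* ub_mmp_config */
} ub_t;

static int
ub_compare(const ub_t *ub1, const ub_t *ub2)
{
        unsigned int seq1 = 0, seq2 = 0;
        int cmp = AVL_CMP(ub1->txg, ub2->txg);

        if (cmp != 0)
                return (cmp);
        cmp = AVL_CMP(ub1->timestamp, ub2->timestamp);
        if (cmp != 0)
                return (cmp);

        /* txg and timestamp are equal: the MMP sequence number decides. */
        if (ub1->mmp_valid && (ub1->mmp_config & MMP_SEQ_VALID_BIT))
                seq1 = MMP_SEQ(ub1);
        if (ub2->mmp_valid && (ub2->mmp_config & MMP_SEQ_VALID_BIT))
                seq2 = MMP_SEQ(ub2);
        return (AVL_CMP(seq1, seq2));
}

int
main(void)
{
        /* Same txg and timestamp; only the MMP sequence differs. */
        ub_t a = { 100, 5000, 1, (3ULL << 32) | MMP_SEQ_VALID_BIT };
        ub_t b = { 100, 5000, 1, (7ULL << 32) | MMP_SEQ_VALID_BIT };

        printf("compare(a, b) = %d\n", ub_compare(&a, &b));
        return (0);
}

With equal txg and timestamp this prints "compare(a, b) = -1": b, carrying the higher sequence, is the more recent uberblock, which is exactly the case the old txg/timestamp-only comparison in vdev_probe could not distinguish.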
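The ub_mmp_config bit layout documented in the struct comment can likewise be checked with a small decoder. Again a sketch rather than loader code: these macros mirror MMP_INTERVAL/MMP_SEQ/MMP_FAIL_INT from the patch but take the raw 64-bit word instead of an uberblock pointer, and the sample value is invented.

/*
 * Standalone sketch (not loader code): decode ub_mmp_config per the
 * layout diagram (VALID byte, 24-bit write interval in ms, 16-bit
 * sequence, 16-bit fail intervals).
 */
#include <stdint.h>
#include <stdio.h>

#define MMP_INTERVAL_VALID_BIT  0x01
#define MMP_SEQ_VALID_BIT       0x02
#define MMP_FAIL_INT_VALID_BIT  0x04

#define MMP_INTERVAL(c)  (((c) & 0x00000000FFFFFF00ULL) >> 8)   /* ms */
#define MMP_SEQ(c)       (((c) & 0x0000FFFF00000000ULL) >> 32)
#define MMP_FAIL_INT(c)  (((c) & 0xFFFF000000000000ULL) >> 48)

int
main(void)
{
        /* fail intervals 10, seq 42, write interval 1000 ms, all valid */
        uint64_t c = (10ULL << 48) | (42ULL << 32) | (1000ULL << 8) |
            MMP_INTERVAL_VALID_BIT | MMP_SEQ_VALID_BIT |
            MMP_FAIL_INT_VALID_BIT;

        if (c & MMP_INTERVAL_VALID_BIT)
                printf("write interval: %ju ms\n", (uintmax_t)MMP_INTERVAL(c));
        if (c & MMP_SEQ_VALID_BIT)
                printf("seq:            %ju\n", (uintmax_t)MMP_SEQ(c));
        if (c & MMP_FAIL_INT_VALID_BIT)
                printf("fail intervals: %ju\n", (uintmax_t)MMP_FAIL_INT(c));
        return (0);
}

Running it prints the three fields back (write interval 1000 ms, seq 42, fail intervals 10), confirming the mask-and-shift positions match the diagram.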