Date: Sat, 7 Jan 2012 02:23:58 +0000 (UTC) From: Martin Matuska <mm@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-8@freebsd.org Subject: svn commit: r229760 - in stable/8/sys: boot/zfs cddl/boot/zfs Message-ID: <201201070223.q072Nw0l020225@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: mm Date: Sat Jan 7 02:23:58 2012 New Revision: 229760 URL: http://svn.freebsd.org/changeset/base/229760 Log: MFC r226549,r226550,r226551,r226552,r226553,r226568 MFC r226549 (pjd): Remove redundant size calculation. MFC r226550 (pjd): Initialize 'rc' properly before using it. This error could lead to infinite loop when data reconstruction was needed. MFC r226551 (pjd): Don't mark vdev as healthy too soon, so we won't try to use invalid vdevs. MFC r226552 (pjd): Never pass NULL block pointer when reading. This is neither expected nor handled by lower layers like vdev_raidz, which uses bp for checksum verification. This bug could lead to NULL pointer reference and resets during boot. MFC r226553 (pjd): Always pass data size for checksum verification function, as using physical block size declared in bp may not always be what we want. For example in case of gang block header physical block size declared in bp is much larger than SPA_GANGBLOCKSIZE (512 bytes) and checksum calculation failed. This bug could lead to accessing unallocated memory and resets/failures during boot. MFC r226568 (pjd) [1]: - Correctly read gang header from raidz. - Decompress assembled gang block data if compressed. - Verify checksum of a gang header. - Verify checksum of assembled gang block data. - Verify checksum of uber block. Submitted by: avg [1] Modified: stable/8/sys/boot/zfs/zfsimpl.c stable/8/sys/cddl/boot/zfs/zfssubr.c Directory Properties: stable/8/sys/ (props changed) Modified: stable/8/sys/boot/zfs/zfsimpl.c ============================================================================== --- stable/8/sys/boot/zfs/zfsimpl.c Sat Jan 7 02:09:49 2012 (r229759) +++ stable/8/sys/boot/zfs/zfsimpl.c Sat Jan 7 02:23:58 2012 (r229760) @@ -347,7 +347,7 @@ vdev_read_phys(vdev_t *vdev, const blkpt rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize); if (rc) return (rc); - if (bp && zio_checksum_error(bp, buf, offset)) + if (bp && zio_checksum_verify(bp, buf)) return (EIO); return (0); @@ -543,8 +543,6 @@ vdev_init_from_nvlist(const unsigned cha vdev->v_state = VDEV_STATE_DEGRADED; else if (isnt_present) vdev->v_state = VDEV_STATE_CANT_OPEN; - else - vdev->v_state = VDEV_STATE_HEALTHY; } rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, @@ -800,6 +798,7 @@ vdev_probe(vdev_phys_read_t *read, void BP_SET_PSIZE(&bp, sizeof(vdev_phys_t)); BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL); BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); + DVA_SET_OFFSET(BP_IDENTITY(&bp), off); ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0)) return (EIO); @@ -912,6 +911,7 @@ vdev_probe(vdev_phys_read_t *read, void if (vdev) { vdev->v_phys_read = read; vdev->v_read_priv = read_priv; + vdev->v_state = VDEV_STATE_HEALTHY; } else { printf("ZFS: inconsistent nvlist contents\n"); return (EIO); @@ -941,7 +941,7 @@ vdev_probe(vdev_phys_read_t *read, void BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0); - if (vdev_read_phys(vdev, NULL, upbuf, off, VDEV_UBERBLOCK_SIZE(vdev))) + if (vdev_read_phys(vdev, &bp, upbuf, off, 0)) continue; if (up->ub_magic != UBERBLOCK_MAGIC) @@ -974,34 +974,39 @@ ilog2(int n) } static int -zio_read_gang(spa_t *spa, const blkptr_t *bp, const dva_t *dva, void *buf) +zio_read_gang(spa_t *spa, const blkptr_t *bp, void *buf) { + blkptr_t gbh_bp; zio_gbh_phys_t zio_gb; - vdev_t *vdev; - int vdevid; - off_t offset; + char *pbuf; int i; - vdevid = DVA_GET_VDEV(dva); - offset = DVA_GET_OFFSET(dva); - STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) - if (vdev->v_id == vdevid) - break; - if (!vdev || !vdev->v_read) - return (EIO); - if (vdev->v_read(vdev, NULL, &zio_gb, offset, SPA_GANGBLOCKSIZE)) + /* Artificial BP for gang block header. */ + gbh_bp = *bp; + BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); + BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE); + BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER); + BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF); + for (i = 0; i < SPA_DVAS_PER_BP; i++) + DVA_SET_GANG(&gbh_bp.blk_dva[i], 0); + + /* Read gang header block using the artificial BP. */ + if (zio_read(spa, &gbh_bp, &zio_gb)) return (EIO); + pbuf = buf; for (i = 0; i < SPA_GBH_NBLKPTRS; i++) { blkptr_t *gbp = &zio_gb.zg_blkptr[i]; if (BP_IS_HOLE(gbp)) continue; - if (zio_read(spa, gbp, buf)) + if (zio_read(spa, gbp, pbuf)) return (EIO); - buf = (char*)buf + BP_GET_PSIZE(gbp); + pbuf += BP_GET_PSIZE(gbp); } - + + if (zio_checksum_verify(bp, buf)) + return (EIO); return (0); } @@ -1024,46 +1029,41 @@ zio_read(spa_t *spa, const blkptr_t *bp, if (!dva->dva_word[0] && !dva->dva_word[1]) continue; - if (DVA_GET_GANG(dva)) { - error = zio_read_gang(spa, bp, dva, buf); - if (error != 0) - continue; - } else { - vdevid = DVA_GET_VDEV(dva); - offset = DVA_GET_OFFSET(dva); - STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { - if (vdev->v_id == vdevid) - break; - } - if (!vdev || !vdev->v_read) - continue; + vdevid = DVA_GET_VDEV(dva); + offset = DVA_GET_OFFSET(dva); + STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { + if (vdev->v_id == vdevid) + break; + } + if (!vdev || !vdev->v_read) + continue; - size = BP_GET_PSIZE(bp); + size = BP_GET_PSIZE(bp); + if (vdev->v_read == vdev_raidz_read) { align = 1ULL << vdev->v_top->v_ashift; if (P2PHASE(size, align) != 0) size = P2ROUNDUP(size, align); - if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF) - pbuf = zfs_alloc(size); - else - pbuf = buf; + } + if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF) + pbuf = zfs_alloc(size); + else + pbuf = buf; + if (DVA_GET_GANG(dva)) + error = zio_read_gang(spa, bp, pbuf); + else error = vdev->v_read(vdev, bp, pbuf, offset, size); - if (error == 0) { - if (cpfunc != ZIO_COMPRESS_OFF) { - error = zio_decompress_data(cpfunc, - pbuf, BP_GET_PSIZE(bp), buf, - BP_GET_LSIZE(bp)); - } else if (size != BP_GET_PSIZE(bp)) { - bcopy(pbuf, buf, BP_GET_PSIZE(bp)); - } - } - if (buf != pbuf) - zfs_free(pbuf, size); - if (error != 0) - continue; + if (error == 0) { + if (cpfunc != ZIO_COMPRESS_OFF) + error = zio_decompress_data(cpfunc, pbuf, + BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp)); + else if (size != BP_GET_PSIZE(bp)) + bcopy(pbuf, buf, BP_GET_PSIZE(bp)); } - error = 0; - break; + if (buf != pbuf) + zfs_free(pbuf, size); + if (error == 0) + break; } if (error != 0) printf("ZFS: i/o error - all block copies unavailable\n"); Modified: stable/8/sys/cddl/boot/zfs/zfssubr.c ============================================================================== --- stable/8/sys/cddl/boot/zfs/zfssubr.c Sat Jan 7 02:09:49 2012 (r229759) +++ stable/8/sys/cddl/boot/zfs/zfssubr.c Sat Jan 7 02:23:58 2012 (r229760) @@ -181,14 +181,17 @@ zio_checksum_label_verifier(zio_cksum_t } static int -zio_checksum_error(const blkptr_t *bp, void *data, uint64_t offset) +zio_checksum_verify(const blkptr_t *bp, void *data) { - unsigned int checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp); - uint64_t size = BP_GET_PSIZE(bp); + uint64_t size; + unsigned int checksum; zio_checksum_info_t *ci; zio_cksum_t actual_cksum, expected_cksum, verifier; int byteswap; + checksum = BP_GET_CHECKSUM(bp); + size = BP_GET_PSIZE(bp); + if (checksum >= ZIO_CHECKSUM_FUNCTIONS) return (EINVAL); ci = &zio_checksum_table[checksum]; @@ -206,7 +209,8 @@ zio_checksum_error(const blkptr_t *bp, v if (checksum == ZIO_CHECKSUM_GANG_HEADER) zio_checksum_gang_verifier(&verifier, bp); else if (checksum == ZIO_CHECKSUM_LABEL) - zio_checksum_label_verifier(&verifier, offset); + zio_checksum_label_verifier(&verifier, + DVA_GET_OFFSET(BP_IDENTITY(bp))); else verifier = bp->blk_cksum; @@ -224,7 +228,6 @@ zio_checksum_error(const blkptr_t *bp, v byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t)); } else { - ASSERT(!BP_IS_GANG(bp)); expected_cksum = bp->blk_cksum; ci->ci_func[0](data, size, &actual_cksum); } @@ -1215,15 +1218,10 @@ static void vdev_raidz_map_free(raidz_map_t *rm) { int c; - size_t size; for (c = rm->rm_firstdatacol - 1; c >= 0; c--) zfs_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); - size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) - size += rm->rm_col[c].rc_size; - zfs_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); } @@ -1245,10 +1243,10 @@ vdev_child(vdev_t *pvd, uint64_t devidx) * any ereports we generate can note it. */ static int -raidz_checksum_verify(const blkptr_t *bp, void *data) +raidz_checksum_verify(const blkptr_t *bp, void *data, uint64_t size) { - return (zio_checksum_error(bp, data, 0)); + return (zio_checksum_verify(bp, data)); } /* @@ -1298,7 +1296,7 @@ raidz_parity_verify(raidz_map_t *rm) */ static int vdev_raidz_combrec(raidz_map_t *rm, const blkptr_t *bp, void *data, - off_t offset, int total_errors, int data_errors) + off_t offset, uint64_t bytes, int total_errors, int data_errors) { raidz_col_t *rc; void *orig[VDEV_RAIDZ_MAXPARITY]; @@ -1377,7 +1375,7 @@ vdev_raidz_combrec(raidz_map_t *rm, cons * success. */ code = vdev_raidz_reconstruct(rm, tgts, n); - if (raidz_checksum_verify(bp, data) == 0) { + if (raidz_checksum_verify(bp, data, bytes) == 0) { for (i = 0; i < n; i++) { c = tgts[i]; rc = &rm->rm_col[c]; @@ -1548,7 +1546,7 @@ reconstruct: */ if (total_errors <= rm->rm_firstdatacol - parity_untried) { if (data_errors == 0) { - if (raidz_checksum_verify(bp, data) == 0) { + if (raidz_checksum_verify(bp, data, bytes) == 0) { /* * If we read parity information (unnecessarily * as it happens since no reconstruction was @@ -1593,7 +1591,7 @@ reconstruct: code = vdev_raidz_reconstruct(rm, tgts, n); - if (raidz_checksum_verify(bp, data) == 0) { + if (raidz_checksum_verify(bp, data, bytes) == 0) { /* * If we read more parity disks than were used * for reconstruction, confirm that the other @@ -1633,7 +1631,9 @@ reconstruct: n = 0; for (c = 0; c < rm->rm_cols; c++) { - if (rm->rm_col[c].rc_tried) + rc = &rm->rm_col[c]; + + if (rc->rc_tried) continue; cvd = vdev_child(vd, rc->rc_devidx); @@ -1665,8 +1665,8 @@ reconstruct: if (total_errors > rm->rm_firstdatacol) { error = EIO; } else if (total_errors < rm->rm_firstdatacol && - (code = vdev_raidz_combrec(rm, bp, data, offset, total_errors, - data_errors)) != 0) { + (code = vdev_raidz_combrec(rm, bp, data, offset, bytes, + total_errors, data_errors)) != 0) { /* * If we didn't use all the available parity for the * combinatorial reconstruction, verify that the remaining
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201201070223.q072Nw0l020225>