Date: Fri, 16 Sep 2011 10:20:40 +0300
From: Andriy Gapon <avg@FreeBSD.org>
To: freebsd-current@FreeBSD.org, freebsd-fs@FreeBSD.org, Pawel Jakub Dawidek <pjd@FreeBSD.org>
Subject: Re: ZFS: i/o error - all block copies unavailable after upgrading to r225312
Message-ID: <4E72F8C8.8070603@FreeBSD.org>
In-Reply-To: <4E6F61CA.6020003@FreeBSD.org>
References: <20110901223646.14b8aae8@o2.pl> <4E60DBBD.1040703@FreeBSD.org> <4E679D3D.1000007@FreeBSD.org> <4E6F61CA.6020003@FreeBSD.org>
I've discovered another issue with both the current code and the previous
versions of the patch, so another version is at the same URL:
http://people.freebsd.org/~avg/zfs-boot-gang.diff

The problem was with gang blocks on a raidz vdev: zio_read_gang would pass a
NULL bp to vdev_raidz_read, but that function doesn't expect that and really
needs a valid bp.

Pawel, I have also figured out the code in zio_read that rounds the size up
based on v_ashift.  That code is there for vdev_raidz_read, which needs the
size to be a multiple of 1 << v_ashift.  And, again, this would have posed a
problem for zio_read_gang, which always passed SPA_GANGBLOCKSIZE when reading
a gang block header.

So, a list of fixes and logical changes:
- correctly read gang header from raidz [*]
- decompress assembled gang block data if compressed
- verify checksum of a gang header
- verify checksum of assembled gang block data
- verify checksum of uber block
[*] - new in this version of the patch.

Description of the code changes:

- remove the offset parameter from zio_checksum_error
  This parameter was only used when verifying the checksum of a vdev label.
  The offset is now passed via the DVA offset field of a made-up bp pointing
  to the label.  zio_checksum_error now gets all checksum parameters from
  the bp.

- zio_read_gang now uses an artificial bp to read a gang header via zio_read
  This solves all the problems with gang blocks on raidz, as zio_read already
  has all the code to handle raidz correctly.

- zio_read performs the size rounding only if v_read == vdev_raidz_read
  This makes the intention of the code clearer and also slightly optimizes
  non-raidz cases with a non-default ashift, where an unnecessary intermediate
  buffer would otherwise be used.  (A small standalone sketch of this rounding
  follows the list.)
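To illustrate that rounding, here is a minimal, self-contained sketch.  It is
not part of the patch: the psize and ashift values are made up, and
P2PHASE/P2ROUNDUP are redefined locally so that the example stands alone.

#include <stdint.h>
#include <stdio.h>

/* Local copies of the rounding helpers, only so this example compiles alone. */
#define P2PHASE(x, align)	((x) & ((align) - 1))
#define P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	uint64_t psize = 1536;		/* hypothetical BP_GET_PSIZE(bp) */
	uint64_t ashift = 12;		/* hypothetical v_top->v_ashift (4K sectors) */
	uint64_t align = 1ULL << ashift;
	uint64_t size = psize;

	/* Same rounding as the raidz-only branch in the patched zio_read. */
	if (P2PHASE(size, align) != 0)
		size = P2ROUNDUP(size, align);

	printf("psize %ju -> raidz read size %ju\n",
	    (uintmax_t)psize, (uintmax_t)size);
	return (0);
}

With ashift 12 the 1536-byte psize rounds up to 4096, which is exactly the
case where zio_read has to go through a temporary buffer, because the read
size no longer matches BP_GET_PSIZE(bp).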
Some inline comments (marked with %):

Index: sys/cddl/boot/zfs/zfssubr.c
===================================================================
--- sys/cddl/boot/zfs/zfssubr.c	(revision 225581)
+++ sys/cddl/boot/zfs/zfssubr.c	(working copy)
@@ -181,14 +181,17 @@
 }
 
 static int
-zio_checksum_error(const blkptr_t *bp, void *data, uint64_t offset)
+zio_checksum_error(const blkptr_t *bp, void *data)
 {
-	unsigned int checksum = BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp);
-	uint64_t size = BP_GET_PSIZE(bp);
+	uint64_t size;
+	unsigned int checksum;
 	zio_checksum_info_t *ci;
 	zio_cksum_t actual_cksum, expected_cksum, verifier;
 	int byteswap;
 
+	checksum = BP_GET_CHECKSUM(bp);
+	size = BP_GET_PSIZE(bp);
+
%
% checksum type and size are always taken from the bp.
% Note that BP_IS_GANG(bp) doesn't imply that the caller wants to verify
% the gang header (as was assumed before); the caller may want to verify
% the whole assembled data as well.
%
 	if (checksum >= ZIO_CHECKSUM_FUNCTIONS)
 		return (EINVAL);
 	ci = &zio_checksum_table[checksum];
@@ -206,7 +209,8 @@
 		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
 			zio_checksum_gang_verifier(&verifier, bp);
 		else if (checksum == ZIO_CHECKSUM_LABEL)
-			zio_checksum_label_verifier(&verifier, offset);
+			zio_checksum_label_verifier(&verifier,
+			    DVA_GET_OFFSET(BP_IDENTITY(bp)));
%
% the label offset is taken from the bp now.
%
 		else
 			verifier = bp->blk_cksum;
@@ -224,7 +228,6 @@
 		byteswap_uint64_array(&expected_cksum, sizeof (zio_cksum_t));
 	} else {
-		ASSERT(!BP_IS_GANG(bp));
%
% the assert is no longer valid, as we also pass a gang block bp
% when verifying the checksum of the assembled data
%
 		expected_cksum = bp->blk_cksum;
 		ci->ci_func[0](data, size, &actual_cksum);
 	}
@@ -1248,7 +1251,7 @@
 raidz_checksum_verify(const blkptr_t *bp, void *data)
 {
 
-	return (zio_checksum_error(bp, data, 0));
+	return (zio_checksum_error(bp, data));
 }
 
 /*
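Before the zfsimpl.c part of the patch, here is a tiny standalone illustration
(not from the patch; the data and sizes are made up) of the buffer-cursor
pattern used in the reworked zio_read_gang() below: the gang members are
copied in through a moving pbuf cursor while the original buf pointer is kept,
so that the assembled data can then be verified as a whole, which is what
zio_checksum_error(bp, buf) does in the patched code.

#include <stdio.h>
#include <string.h>

int
main(void)
{
	/* Stand-ins for the gang members' data; the real code reads them via zio_read(). */
	const char *pieces[] = { "abc", "defg", "h" };
	char buf[16];
	char *pbuf = buf;	/* moving cursor; buf keeps pointing at the start */
	size_t i, n;

	for (i = 0; i < sizeof(pieces) / sizeof(pieces[0]); i++) {
		n = strlen(pieces[i]);
		memcpy(pbuf, pieces[i], n);
		pbuf += n;	/* advance the cursor, like pbuf += BP_GET_PSIZE(gbp) */
	}
	buf[pbuf - buf] = '\0';

	/* This is the point where the patch verifies the whole via the original pointer. */
	printf("assembled %zu bytes: %s\n", (size_t)(pbuf - buf), buf);
	return (0);
}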
Index: sys/boot/zfs/zfsimpl.c
===================================================================
--- sys/boot/zfs/zfsimpl.c	(revision 225581)
+++ sys/boot/zfs/zfsimpl.c	(working copy)
@@ -347,7 +347,7 @@
 	rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
 	if (rc)
 		return (rc);
-	if (bp && zio_checksum_error(bp, buf, offset))
+	if (bp && zio_checksum_error(bp, buf))
%
% the bp can still be NULL here: when the raidz code reads blocks of data
% it doesn't pass any bp, because there is really no bp for that data.
% this should be the only case; in all other cases a valid bp must be provided.
%
 		return (EIO);
 
 	return (0);
@@ -800,6 +800,7 @@
 	BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
 	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
 	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+	DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
%
% as described above, the offset is now passed via the DVA offset field
%
 	ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
 	if (vdev_read_phys(&vtmp, &bp, vdev_label, off, 0))
 		return (EIO);
@@ -941,7 +942,7 @@
 		BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
 		ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
 
-		if (vdev_read_phys(vdev, NULL, upbuf, off, VDEV_UBERBLOCK_SIZE(vdev)))
+		if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
%
% pass the artificial uberblock bp to vdev_read_phys, so that it
% can call zio_checksum_error and verify the checksum
%
 			continue;
 
 		if (up->ub_magic != UBERBLOCK_MAGIC)
@@ -974,34 +975,39 @@
 }
 
 static int
-zio_read_gang(spa_t *spa, const blkptr_t *bp, const dva_t *dva, void *buf)
+zio_read_gang(spa_t *spa, const blkptr_t *bp, void *buf)
 {
+	blkptr_t gbh_bp;
 	zio_gbh_phys_t zio_gb;
-	vdev_t *vdev;
-	int vdevid;
-	off_t offset;
+	char *pbuf;
 	int i;
 
-	vdevid = DVA_GET_VDEV(dva);
-	offset = DVA_GET_OFFSET(dva);
-	STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink)
-		if (vdev->v_id == vdevid)
-			break;
-	if (!vdev || !vdev->v_read)
+	/* Artificial BP for gang block header. */
+	gbh_bp = *bp;
+	BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
+	BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
+	BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
+	BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
+	for (i = 0; i < SPA_DVAS_PER_BP; i++)
+		DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
%
% the artificial bp for the gang header.
% it has PSIZE and LSIZE of SPA_GANGBLOCKSIZE.
% checksum is set to ZIO_CHECKSUM_GANG_HEADER, so that zio_checksum_error
% does the right thing.
% compression is set to ZIO_COMPRESS_OFF, so that zio_read does the right thing.
% the gang bit is cleared from the DVAs - we read only the gang header block and
% do not want to create endless recursion here.
% vdevs and offsets are preserved in the DVAs of the BP.
%
+
+	/* Read gang header block using the artificial BP. */
+	if (zio_read(spa, &gbh_bp, &zio_gb))
 		return (EIO);
-	if (vdev->v_read(vdev, NULL, &zio_gb, offset, SPA_GANGBLOCKSIZE))
-		return (EIO);
%
% so the smarter zio_read can now be used here instead of the dumber v_read,
% which, again, would have crashed on a NULL bp if v_read == vdev_raidz_read
%
+	pbuf = buf;
%
% keep the original buffer pointer, use pbuf to populate the buffer
%
 	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
 		blkptr_t *gbp = &zio_gb.zg_blkptr[i];
 
 		if (BP_IS_HOLE(gbp))
 			continue;
-		if (zio_read(spa, gbp, buf))
+		if (zio_read(spa, gbp, pbuf))
 			return (EIO);
-		buf = (char*)buf + BP_GET_PSIZE(gbp);
+		pbuf += BP_GET_PSIZE(gbp);
 	}
-
+
+	if (zio_checksum_error(bp, buf))
+		return (EIO);
%
% the original pointer is used to verify the checksum of the assembled data
% of the gang block.
% Note: this is where the gang bp is passed to zio_checksum_error.
%
 	return (0);
 }
@@ -1024,46 +1030,41 @@
 		if (!dva->dva_word[0] && !dva->dva_word[1])
 			continue;
 
-		if (DVA_GET_GANG(dva)) {
-			error = zio_read_gang(spa, bp, dva, buf);
-			if (error != 0)
-				continue;
-		} else {
-			vdevid = DVA_GET_VDEV(dva);
-			offset = DVA_GET_OFFSET(dva);
-			STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
-				if (vdev->v_id == vdevid)
-					break;
-			}
-			if (!vdev || !vdev->v_read)
-				continue;
+		vdevid = DVA_GET_VDEV(dva);
+		offset = DVA_GET_OFFSET(dva);
+		STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
+			if (vdev->v_id == vdevid)
+				break;
+		}
+		if (!vdev || !vdev->v_read)
+			continue;
 
-			size = BP_GET_PSIZE(bp);
+		size = BP_GET_PSIZE(bp);
+		if (vdev->v_read == vdev_raidz_read) {
 			align = 1ULL << vdev->v_top->v_ashift;
 			if (P2PHASE(size, align) != 0)
 				size = P2ROUNDUP(size, align);
%
% do the size rounding up only if v_read == vdev_raidz_read,
% because only vdev_raidz_read requires that.
% vdev_read_phys requires only 512-byte granularity.
%
-			if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
-				pbuf = zfs_alloc(size);
-			else
-				pbuf = buf;
+		}
+		if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
+			pbuf = zfs_alloc(size);
+		else
+			pbuf = buf;
%
% So a temporary buffer is used only if either the block is compressed or
% vdev_raidz_read requires a larger buffer.
%
+		if (DVA_GET_GANG(dva))
+			error = zio_read_gang(spa, bp, pbuf);
+		else
 			error = vdev->v_read(vdev, bp, pbuf, offset, size);
-			if (error == 0) {
-				if (cpfunc != ZIO_COMPRESS_OFF) {
-					error = zio_decompress_data(cpfunc,
-					    pbuf, BP_GET_PSIZE(bp), buf,
-					    BP_GET_LSIZE(bp));
-				} else if (size != BP_GET_PSIZE(bp)) {
-					bcopy(pbuf, buf, BP_GET_PSIZE(bp));
-				}
-			}
-			if (buf != pbuf)
-				zfs_free(pbuf, size);
-			if (error != 0)
-				continue;
+		if (error == 0) {
+			if (cpfunc != ZIO_COMPRESS_OFF)
+				error = zio_decompress_data(cpfunc, pbuf,
+				    BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
+			else if (size != BP_GET_PSIZE(bp))
+				bcopy(pbuf, buf, BP_GET_PSIZE(bp));
 		}
%
% Decompression is now done for the gang blocks too.
% Ditto for the use of larger buffers in the raidz case.
%
-		error = 0;
-		break;
+		if (buf != pbuf)
+			zfs_free(pbuf, size);
+		if (error == 0)
+			break;
 	}
 	if (error != 0)
 		printf("ZFS: i/o error - all block copies unavailable\n");

-- 
Andriy Gapon