Date: Sat, 2 Aug 2014 03:56:07 +0000 (UTC) From: Xin LI <delphij@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org Subject: svn commit: r269416 - in stable/10: cddl/contrib/opensolaris/cmd/ztest sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys Message-ID: <201408020356.s723u7ZL001571@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: delphij Date: Sat Aug 2 03:56:06 2014 New Revision: 269416 URL: http://svnweb.freebsd.org/changeset/base/269416 Log: MFC r268855: MFV r268848: Instead of asserting all zio's be properly aligned, only assert on the logical ones. Cap uberblocks at 8k, otherwise with ashift=17, there would be only one uberblock. This fixes a problem that zdb would trip assert on pools with ashift >= 0xe (8k). While there, also change the code so it only attempt to condense space map unless the uncondensed size consumes greater than zfs_metaslab_condense_block_threshold blocks. Illumos issue: 4958 zdb trips assert on pools with ashift >= 0xe Modified: stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Directory Properties: stable/10/ (props changed) Modified: stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c ============================================================================== --- stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c Sat Aug 2 03:48:16 2014 (r269415) +++ stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c Sat Aug 2 03:56:06 2014 (r269416) @@ -810,7 +810,7 @@ static uint64_t ztest_get_ashift(void) { if (ztest_opts.zo_ashift == 0) - return (SPA_MINBLOCKSHIFT + ztest_random(3)); + return (SPA_MINBLOCKSHIFT + ztest_random(5)); return (ztest_opts.zo_ashift); } @@ -969,11 +969,28 @@ ztest_random_spa_version(uint64_t initia return (version); } +/* + * Find the largest ashift used + */ +static uint64_t +ztest_spa_get_ashift() { + uint64_t i; + uint64_t ashift = SPA_MINBLOCKSHIFT; + vdev_t *rvd = ztest_spa->spa_root_vdev; + + for (i = 0; i < rvd->vdev_children; i++) { + ashift = MAX(ashift, rvd->vdev_child[i]->vdev_ashift); + } + return (ashift); +} + static int ztest_random_blocksize(void) { - return (1 << (SPA_MINBLOCKSHIFT + - ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1))); + // Choose a block size >= the ashift. + uint64_t block_shift = + ztest_random(SPA_MAXBLOCKSHIFT - ztest_spa_get_ashift() + 1); + return (1 << (SPA_MINBLOCKSHIFT + block_shift)); } static int @@ -5768,16 +5785,30 @@ ztest_freeze(void) spa_freeze(spa); /* + * Because it is hard to predict how much space a write will actually + * require beforehand, we leave ourselves some fudge space to write over + * capacity. + */ + uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; + + /* * Run tests that generate log records but don't alter the pool config * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). * We do a txg_wait_synced() after each iteration to force the txg * to increase well beyond the last synced value in the uberblock. * The ZIL should be OK with that. + * + * Run a random number of times less than zo_maxloops and ensure we do + * not run out of space on the pool. */ while (ztest_random(10) != 0 && - numloops++ < ztest_opts.zo_maxloops) { - ztest_dmu_write_parallel(zd, 0); - ztest_dmu_object_alloc_free(zd, 0); + numloops++ < ztest_opts.zo_maxloops && + metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { + ztest_od_t od; + ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); + VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); + ztest_io(zd, od.od_object, + ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); txg_wait_synced(spa_get_dsl(spa), 0); } Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c Sat Aug 2 03:48:16 2014 (r269415) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c Sat Aug 2 03:56:06 2014 (r269416) @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. */ Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c Sat Aug 2 03:48:16 2014 (r269415) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c Sat Aug 2 03:56:06 2014 (r269416) @@ -74,6 +74,21 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_ " of in-memory counterpart"); /* + * Condensing a metaslab is not guaranteed to actually reduce the amount of + * space used on disk. In particular, a space map uses data in increments of + * MAX(1 << ashift, SPACE_MAP_INITIAL_BLOCKSIZE), so a metaslab might use the + * same number of blocks after condensing. Since the goal of condensing is to + * reduce the number of IOPs required to read the space map, we only want to + * condense when we can be sure we will reduce the number of blocks used by the + * space map. Unfortunately, we cannot precisely compute whether or not this is + * the case in metaslab_should_condense since we are holding ms_lock. Instead, + * we apply the following heuristic: do not condense a spacemap unless the + * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold + * blocks. + */ +int zfs_metaslab_condense_block_threshold = 4; + +/* * The zfs_mg_noalloc_threshold defines which metaslab groups should * be eligible for allocation. The value is defined as a percentage of * a free space. Metaslab groups that have more free space than @@ -1371,6 +1386,8 @@ metaslab_group_preload(metaslab_group_t * times the size than the free space range tree representation * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1.MB). * + * 3. The on-disk size of the space map should actually decrease. + * * Checking the first condition is tricky since we don't want to walk * the entire AVL tree calculating the estimated on-disk size. Instead we * use the size-ordered range tree in the metaslab and calculate the @@ -1381,13 +1398,21 @@ metaslab_group_preload(metaslab_group_t * To determine the second criterion we use a best-case estimate and assume * each segment can be represented on-disk as a single 64-bit entry. We refer * to this best-case estimate as the space map's minimal form. + * + * Unfortunately, we cannot compute the on-disk size of the space map in this + * context because we cannot accurately compute the effects of compression, etc. + * Instead, we apply the heuristic described in the block comment for + * zfs_metaslab_condense_block_threshold - we only condense if the space used + * is greater than a threshold number of blocks. */ static boolean_t metaslab_should_condense(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; range_seg_t *rs; - uint64_t size, entries, segsz; + uint64_t size, entries, segsz, object_size, optimal_size, record_size; + dmu_object_info_t doi; + uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift; ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loaded); @@ -1411,9 +1436,15 @@ metaslab_should_condense(metaslab_t *msp entries = size / (MIN(size, SM_RUN_MAX)); segsz = entries * sizeof (uint64_t); - return (segsz <= space_map_length(msp->ms_sm) && - space_map_length(msp->ms_sm) >= (zfs_condense_pct * - sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100); + optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root); + object_size = space_map_length(msp->ms_sm); + + dmu_object_info_from_db(sm->sm_dbuf, &doi); + record_size = MAX(doi.doi_data_block_size, vdev_blocksize); + + return (segsz <= object_size && + object_size >= (optimal_size * zfs_condense_pct / 100) && + object_size > zfs_metaslab_condense_block_threshold * record_size); } /* Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c Sat Aug 2 03:48:16 2014 (r269415) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c Sat Aug 2 03:56:06 2014 (r269416) @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved. */ Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h Sat Aug 2 03:48:16 2014 (r269415) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h Sat Aug 2 03:56:06 2014 (r269416) @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_VDEV_IMPL_H @@ -249,8 +249,11 @@ struct vdev { #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) +/* The largest uberblock we support is 8k. */ +#define MAX_UBERBLOCK_SHIFT (13) #define VDEV_UBERBLOCK_SHIFT(vd) \ - MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT) + MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \ + MAX_UBERBLOCK_SHIFT) #define VDEV_UBERBLOCK_COUNT(vd) \ (VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd)) #define VDEV_UBERBLOCK_OFFSET(vd, n) \ Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h Sat Aug 2 03:48:16 2014 (r269415) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h Sat Aug 2 03:56:06 2014 (r269416) @@ -165,19 +165,20 @@ enum zio_flag { ZIO_FLAG_RESILVER = 1 << 3, ZIO_FLAG_SCRUB = 1 << 4, ZIO_FLAG_SCAN_THREAD = 1 << 5, + ZIO_FLAG_PHYSICAL = 1 << 6, #define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) /* * Flags inherited by ddt, gang, and vdev children. */ - ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */ - ZIO_FLAG_SPECULATIVE = 1 << 7, - ZIO_FLAG_CONFIG_WRITER = 1 << 8, - ZIO_FLAG_DONT_RETRY = 1 << 9, - ZIO_FLAG_DONT_CACHE = 1 << 10, - ZIO_FLAG_NODATA = 1 << 11, - ZIO_FLAG_INDUCE_DAMAGE = 1 << 12, + ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */ + ZIO_FLAG_SPECULATIVE = 1 << 8, + ZIO_FLAG_CONFIG_WRITER = 1 << 9, + ZIO_FLAG_DONT_RETRY = 1 << 10, + ZIO_FLAG_DONT_CACHE = 1 << 11, + ZIO_FLAG_NODATA = 1 << 12, + ZIO_FLAG_INDUCE_DAMAGE = 1 << 13, #define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) #define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) @@ -185,27 +186,27 @@ enum zio_flag { /* * Flags inherited by vdev children. */ - ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */ - ZIO_FLAG_PROBE = 1 << 14, - ZIO_FLAG_TRYHARD = 1 << 15, - ZIO_FLAG_OPTIONAL = 1 << 16, + ZIO_FLAG_IO_RETRY = 1 << 14, /* must be first for INHERIT */ + ZIO_FLAG_PROBE = 1 << 15, + ZIO_FLAG_TRYHARD = 1 << 16, + ZIO_FLAG_OPTIONAL = 1 << 17, #define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) /* * Flags not inherited by any children. */ - ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */ - ZIO_FLAG_DONT_PROPAGATE = 1 << 18, - ZIO_FLAG_IO_BYPASS = 1 << 19, - ZIO_FLAG_IO_REWRITE = 1 << 20, - ZIO_FLAG_RAW = 1 << 21, - ZIO_FLAG_GANG_CHILD = 1 << 22, - ZIO_FLAG_DDT_CHILD = 1 << 23, - ZIO_FLAG_GODFATHER = 1 << 24, - ZIO_FLAG_NOPWRITE = 1 << 25, - ZIO_FLAG_REEXECUTED = 1 << 26, - ZIO_FLAG_DELEGATED = 1 << 27, + ZIO_FLAG_DONT_QUEUE = 1 << 18, /* must be first for INHERIT */ + ZIO_FLAG_DONT_PROPAGATE = 1 << 19, + ZIO_FLAG_IO_BYPASS = 1 << 20, + ZIO_FLAG_IO_REWRITE = 1 << 21, + ZIO_FLAG_RAW = 1 << 22, + ZIO_FLAG_GANG_CHILD = 1 << 23, + ZIO_FLAG_DDT_CHILD = 1 << 24, + ZIO_FLAG_GODFATHER = 1 << 25, + ZIO_FLAG_NOPWRITE = 1 << 26, + ZIO_FLAG_REEXECUTED = 1 << 27, + ZIO_FLAG_DELEGATED = 1 << 28, }; #define ZIO_FLAG_MUSTSUCCEED 0 Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c Sat Aug 2 03:48:16 2014 (r269415) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c Sat Aug 2 03:56:06 2014 (r269416) @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -57,7 +57,10 @@ zfs_dbgmsg_fini(void) * echo ::zfs_dbgmsg | mdb -k * * Monitor these messages by running: - * dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' + * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}' + * + * When used with libzpool, monitor with: + * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}' */ void zfs_dbgmsg(const char *fmt, ...) Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Sat Aug 2 03:48:16 2014 (r269415) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Sat Aug 2 03:56:06 2014 (r269416) @@ -886,8 +886,8 @@ zio_read_phys(zio_t *pio, vdev_t *vd, ui ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, - ZIO_TYPE_READ, priority, flags, vd, offset, NULL, - ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); + ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, + NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; @@ -907,8 +907,8 @@ zio_write_phys(zio_t *pio, vdev_t *vd, u ASSERT3U(offset + size, <=, vd->vdev_psize); zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private, - ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL, - ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); + ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, + NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); zio->io_prop.zp_checksum = checksum; @@ -2621,7 +2621,9 @@ zio_vdev_io_start(zio_t **ziop) align = 1ULL << vd->vdev_top->vdev_ashift; - if (P2PHASE(zio->io_size, align) != 0) { + if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && + P2PHASE(zio->io_size, align) != 0) { + /* Transform logical writes to be a full physical block size. */ uint64_t asize = P2ROUNDUP(zio->io_size, align); char *abuf = NULL; if (zio->io_type == ZIO_TYPE_READ || @@ -2636,8 +2638,22 @@ zio_vdev_io_start(zio_t **ziop) zio_subblock); } - ASSERT(P2PHASE(zio->io_offset, align) == 0); - ASSERT(P2PHASE(zio->io_size, align) == 0); + /* + * If this is not a physical io, make sure that it is properly aligned + * before proceeding. + */ + if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { + ASSERT0(P2PHASE(zio->io_offset, align)); + ASSERT0(P2PHASE(zio->io_size, align)); + } else { + /* + * For physical writes, we allow 512b aligned writes and assume + * the device will perform a read-modify-write as necessary. + */ + ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); + ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); + } + VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa)); /*
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201408020356.s723u7ZL001571>