Date: Thu, 21 Nov 2019 13:35:44 +0000 (UTC) From: Andriy Gapon <avg@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r354948 - in head: cddl/contrib/opensolaris/cmd/zdb sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys Message-ID: <201911211335.xALDZikp029490@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: avg Date: Thu Nov 21 13:35:43 2019 New Revision: 354948 URL: https://svnweb.freebsd.org/changeset/base/354948 Log: MFV r354383: 10592 misc. metaslab and vdev related ZoL bug fixes illumos/illumos-gate@555d674d5d4b8191dc83723188349d28278b2431 https://github.com/illumos/illumos-gate/commit/555d674d5d4b8191dc83723188349d28278b2431 https://www.illumos.org/issues/10592 This is a collection of recent fixes from ZoL: 8eef997679b Error path in metaslab_load_impl() forgets to drop ms_sync_lock 928e8ad47d3 Introduce auxiliary metaslab histograms 425d3237ee8 Get rid of space_map_update() for ms_synced_length 6c926f426a2 Simplify log vdev removal code 21e7cf5da89 zdb -L should skip leak detection altogether df72b8bebe0 Rename range_tree_verify to range_tree_verify_not_present 75058f33034 Remove unused vdev_t fields Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com> Author: Serapheim Dimitropoulos <serapheim@delphix.com> MFC after: 4 weeks Modified: head/cddl/contrib/opensolaris/cmd/zdb/zdb.8 head/cddl/contrib/opensolaris/cmd/zdb/zdb.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c Directory Properties: head/cddl/contrib/opensolaris/ (props changed) head/cddl/contrib/opensolaris/cmd/zdb/ (props changed) head/sys/cddl/contrib/opensolaris/ (props changed) Modified: head/cddl/contrib/opensolaris/cmd/zdb/zdb.8 ============================================================================== --- head/cddl/contrib/opensolaris/cmd/zdb/zdb.8 Thu Nov 21 13:22:23 2019 (r354947) +++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.8 Thu Nov 21 13:35:43 2019 (r354948) @@ -10,7 +10,7 @@ .\" .\" .\" Copyright 2012, Richard Lowe. -.\" Copyright (c) 2012, 2017 by Delphix. All rights reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. .\" Copyright 2017 Nexenta Systems, Inc. .\" .Dd October 06, 2017 @@ -187,7 +187,7 @@ If the .Fl u option is also specified, also display the uberblocks on this device. .It Fl L -Disable leak tracing and the loading of space maps. +Disable leak detection and the loading of space maps. By default, .Nm verifies that all non-free blocks are referenced, which can be very expensive. Modified: head/cddl/contrib/opensolaris/cmd/zdb/zdb.c ============================================================================== --- head/cddl/contrib/opensolaris/cmd/zdb/zdb.c Thu Nov 21 13:22:23 2019 (r354947) +++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.c Thu Nov 21 13:35:43 2019 (r354948) @@ -785,18 +785,21 @@ dump_spacemap(objset_t *os, space_map_t *sm) return; (void) printf("space map object %llu:\n", - (longlong_t)sm->sm_phys->smp_object); - (void) printf(" smp_objsize = 0x%llx\n", - (longlong_t)sm->sm_phys->smp_objsize); + (longlong_t)sm->sm_object); + (void) printf(" smp_length = 0x%llx\n", + (longlong_t)sm->sm_phys->smp_length); (void) printf(" smp_alloc = 0x%llx\n", (longlong_t)sm->sm_phys->smp_alloc); + if (dump_opt['d'] < 6 && dump_opt['m'] < 4) + return; + /* * Print out the freelist entries in both encoded and decoded form. */ uint8_t mapshift = sm->sm_shift; int64_t alloc = 0; - uint64_t word; + uint64_t word, entry_id = 0; for (uint64_t offset = 0; offset < space_map_length(sm); offset += sizeof (word)) { @@ -804,11 +807,12 @@ dump_spacemap(objset_t *os, space_map_t *sm) sizeof (word), &word, DMU_READ_PREFETCH)); if (sm_entry_is_debug(word)) { - (void) printf("\t [%6llu] %s: txg %llu, pass %llu\n", - (u_longlong_t)(offset / sizeof (word)), + (void) printf("\t [%6llu] %s: txg %llu pass %llu\n", + (u_longlong_t)entry_id, ddata[SM_DEBUG_ACTION_DECODE(word)], (u_longlong_t)SM_DEBUG_TXG_DECODE(word), (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word)); + entry_id++; continue; } @@ -846,7 +850,7 @@ dump_spacemap(objset_t *os, space_map_t *sm) (void) printf("\t [%6llu] %c range:" " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n", - (u_longlong_t)(offset / sizeof (word)), + (u_longlong_t)entry_id, entry_type, (u_longlong_t)entry_off, (u_longlong_t)(entry_off + entry_run), (u_longlong_t)entry_run, @@ -856,8 +860,9 @@ dump_spacemap(objset_t *os, space_map_t *sm) alloc += entry_run; else alloc -= entry_run; + entry_id++; } - if ((uint64_t)alloc != space_map_allocated(sm)) { + if (alloc != space_map_allocated(sm)) { (void) printf("space_map_object alloc (%lld) INCONSISTENT " "with space map summary (%lld)\n", (longlong_t)space_map_allocated(sm), (longlong_t)alloc); @@ -921,11 +926,8 @@ dump_metaslab(metaslab_t *msp) SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } - if (dump_opt['d'] > 5 || dump_opt['m'] > 3) { - ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); - - dump_spacemap(spa->spa_meta_objset, msp->ms_sm); - } + ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); + dump_spacemap(spa->spa_meta_objset, msp->ms_sm); } static void @@ -3140,6 +3142,8 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) ddt_entry_t dde; int error; + ASSERT(!dump_opt['L']); + bzero(&ddb, sizeof (ddb)); while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { blkptr_t blk; @@ -3163,12 +3167,10 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) zcb->zcb_dedup_blocks++; } } - if (!dump_opt['L']) { - ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; - ddt_enter(ddt); - VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); - ddt_exit(ddt); - } + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + ddt_enter(ddt); + VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); + ddt_exit(ddt); } ASSERT(error == ENOENT); @@ -3210,6 +3212,9 @@ claim_segment_cb(void *arg, uint64_t offset, uint64_t static void zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) { + if (dump_opt['L']) + return; + if (spa->spa_vdev_removal == NULL) return; @@ -3301,7 +3306,6 @@ zdb_load_obsolete_counts(vdev_t *vd) space_map_t *prev_obsolete_sm = NULL; VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); - space_map_update(prev_obsolete_sm); vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, prev_obsolete_sm); space_map_close(prev_obsolete_sm); @@ -3395,9 +3399,9 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_ VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); - space_map_update(checkpoint_sm); VERIFY0(space_map_iterate(checkpoint_sm, + space_map_length(checkpoint_sm), checkpoint_sm_exclude_entry_cb, &cseea)); space_map_close(checkpoint_sm); @@ -3407,6 +3411,8 @@ zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_ static void zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) { + ASSERT(!dump_opt['L']); + vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id); @@ -3503,6 +3509,8 @@ load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab static void zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) { + ASSERT(!dump_opt['L']); + vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; @@ -3549,67 +3557,63 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) { zcb->zcb_spa = spa; - if (!dump_opt['L']) { - dsl_pool_t *dp = spa->spa_dsl_pool; - vdev_t *rvd = spa->spa_root_vdev; + if (dump_opt['L']) + return; - /* - * We are going to be changing the meaning of the metaslab's - * ms_allocatable. Ensure that the allocator doesn't try to - * use the tree. - */ - spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; - spa->spa_log_class->mc_ops = &zdb_metaslab_ops; + dsl_pool_t *dp = spa->spa_dsl_pool; + vdev_t *rvd = spa->spa_root_vdev; - zcb->zcb_vd_obsolete_counts = - umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), - UMEM_NOFAIL); + /* + * We are going to be changing the meaning of the metaslab's + * ms_allocatable. Ensure that the allocator doesn't try to + * use the tree. + */ + spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; + spa->spa_log_class->mc_ops = &zdb_metaslab_ops; - /* - * For leak detection, we overload the ms_allocatable trees - * to contain allocated segments instead of free segments. - * As a result, we can't use the normal metaslab_load/unload - * interfaces. - */ - zdb_leak_init_prepare_indirect_vdevs(spa, zcb); - load_concrete_ms_allocatable_trees(spa, SM_ALLOC); + zcb->zcb_vd_obsolete_counts = + umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), + UMEM_NOFAIL); - /* - * On load_concrete_ms_allocatable_trees() we loaded all the - * allocated entries from the ms_sm to the ms_allocatable for - * each metaslab. If the pool has a checkpoint or is in the - * middle of discarding a checkpoint, some of these blocks - * may have been freed but their ms_sm may not have been - * updated because they are referenced by the checkpoint. In - * order to avoid false-positives during leak-detection, we - * go through the vdev's checkpoint space map and exclude all - * its entries from their relevant ms_allocatable. - * - * We also aggregate the space held by the checkpoint and add - * it to zcb_checkpoint_size. - * - * Note that at this point we are also verifying that all the - * entries on the checkpoint_sm are marked as allocated in - * the ms_sm of their relevant metaslab. - * [see comment in checkpoint_sm_exclude_entry_cb()] - */ - zdb_leak_init_exclude_checkpoint(spa, zcb); + /* + * For leak detection, we overload the ms_allocatable trees + * to contain allocated segments instead of free segments. + * As a result, we can't use the normal metaslab_load/unload + * interfaces. + */ + zdb_leak_init_prepare_indirect_vdevs(spa, zcb); + load_concrete_ms_allocatable_trees(spa, SM_ALLOC); - /* for cleaner progress output */ - (void) fprintf(stderr, "\n"); + /* + * On load_concrete_ms_allocatable_trees() we loaded all the + * allocated entries from the ms_sm to the ms_allocatable for + * each metaslab. If the pool has a checkpoint or is in the + * middle of discarding a checkpoint, some of these blocks + * may have been freed but their ms_sm may not have been + * updated because they are referenced by the checkpoint. In + * order to avoid false-positives during leak-detection, we + * go through the vdev's checkpoint space map and exclude all + * its entries from their relevant ms_allocatable. + * + * We also aggregate the space held by the checkpoint and add + * it to zcb_checkpoint_size. + * + * Note that at this point we are also verifying that all the + * entries on the checkpoint_sm are marked as allocated in + * the ms_sm of their relevant metaslab. + * [see comment in checkpoint_sm_exclude_entry_cb()] + */ + zdb_leak_init_exclude_checkpoint(spa, zcb); + ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa)); - if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { - ASSERT(spa_feature_is_enabled(spa, - SPA_FEATURE_DEVICE_REMOVAL)); - (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, - increment_indirect_mapping_cb, zcb, NULL); - } - } else { - /* - * If leak tracing is disabled, we still need to consider - * any checkpointed space in our space verification. - */ - zcb->zcb_checkpoint_size += spa_get_checkpoint_space(spa); + /* for cleaner progress output */ + (void) fprintf(stderr, "\n"); + + if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_DEVICE_REMOVAL)); + (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, + increment_indirect_mapping_cb, zcb, NULL); } spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); @@ -3690,52 +3694,58 @@ zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb static boolean_t zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) { + if (dump_opt['L']) + return (B_FALSE); + boolean_t leaks = B_FALSE; - if (!dump_opt['L']) { - vdev_t *rvd = spa->spa_root_vdev; - for (unsigned c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - metaslab_group_t *mg = vd->vdev_mg; - if (zcb->zcb_vd_obsolete_counts[c] != NULL) { - leaks |= zdb_check_for_obsolete_leaks(vd, zcb); - } + vdev_t *rvd = spa->spa_root_vdev; + for (unsigned c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; +#if DEBUG + metaslab_group_t *mg = vd->vdev_mg; +#endif - for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - ASSERT3P(mg, ==, msp->ms_group); + if (zcb->zcb_vd_obsolete_counts[c] != NULL) { + leaks |= zdb_check_for_obsolete_leaks(vd, zcb); + } - /* - * ms_allocatable has been overloaded - * to contain allocated segments. Now that - * we finished traversing all blocks, any - * block that remains in the ms_allocatable - * represents an allocated block that we - * did not claim during the traversal. - * Claimed blocks would have been removed - * from the ms_allocatable. For indirect - * vdevs, space remaining in the tree - * represents parts of the mapping that are - * not referenced, which is not a bug. - */ - if (vd->vdev_ops == &vdev_indirect_ops) { - range_tree_vacate(msp->ms_allocatable, - NULL, NULL); - } else { - range_tree_vacate(msp->ms_allocatable, - zdb_leak, vd); - } + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + ASSERT3P(mg, ==, msp->ms_group); - if (msp->ms_loaded) { - msp->ms_loaded = B_FALSE; - } + /* + * ms_allocatable has been overloaded + * to contain allocated segments. Now that + * we finished traversing all blocks, any + * block that remains in the ms_allocatable + * represents an allocated block that we + * did not claim during the traversal. + * Claimed blocks would have been removed + * from the ms_allocatable. For indirect + * vdevs, space remaining in the tree + * represents parts of the mapping that are + * not referenced, which is not a bug. + */ + if (vd->vdev_ops == &vdev_indirect_ops) { + range_tree_vacate(msp->ms_allocatable, + NULL, NULL); + } else { + range_tree_vacate(msp->ms_allocatable, + zdb_leak, vd); } + + if (msp->ms_loaded) { + msp->ms_loaded = B_FALSE; + } } - umem_free(zcb->zcb_vd_obsolete_counts, - rvd->vdev_children * sizeof (uint32_t *)); - zcb->zcb_vd_obsolete_counts = NULL; } + + umem_free(zcb->zcb_vd_obsolete_counts, + rvd->vdev_children * sizeof (uint32_t *)); + zcb->zcb_vd_obsolete_counts = NULL; + return (leaks); } @@ -3774,13 +3784,18 @@ dump_block_stats(spa_t *spa) !dump_opt['L'] ? "nothing leaked " : ""); /* - * Load all space maps as SM_ALLOC maps, then traverse the pool - * claiming each block we discover. If the pool is perfectly - * consistent, the space maps will be empty when we're done. - * Anything left over is a leak; any block we can't claim (because - * it's not part of any space map) is a double allocation, - * reference to a freed block, or an unclaimed log block. + * When leak detection is enabled we load all space maps as SM_ALLOC + * maps, then traverse the pool claiming each block we discover. If + * the pool is perfectly consistent, the segment trees will be empty + * when we're done. Anything left over is a leak; any block we can't + * claim (because it's not part of any space map) is a double + * allocation, reference to a freed block, or an unclaimed log block. + * + * When leak detection is disabled (-L option) we still traverse the + * pool claiming each block we discover, but we skip opening any space + * maps. */ + bzero(&zcb, sizeof (zdb_cb_t)); zdb_leak_init(spa, &zcb); /* @@ -3859,11 +3874,10 @@ dump_block_stats(spa_t *spa) total_found = tzb->zb_asize - zcb.zcb_dedup_asize + zcb.zcb_removing_size + zcb.zcb_checkpoint_size; - if (total_found == total_alloc) { - if (!dump_opt['L']) - (void) printf("\n\tNo leaks (block sum matches space" - " maps exactly)\n"); - } else { + if (total_found == total_alloc && !dump_opt['L']) { + (void) printf("\n\tNo leaks (block sum matches space" + " maps exactly)\n"); + } else if (!dump_opt['L']) { (void) printf("block traversal size %llu != alloc %llu " "(%s %lld)\n", (u_longlong_t)total_found, @@ -4203,7 +4217,6 @@ verify_device_removal_feature_counts(spa_t *spa) spa->spa_meta_objset, scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); - space_map_update(prev_obsolete_sm); dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); (void) printf("\n"); space_map_close(prev_obsolete_sm); @@ -4409,7 +4422,8 @@ verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, * their respective ms_allocateable trees should not contain them. */ mutex_enter(&ms->ms_lock); - range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run); + range_tree_verify_not_present(ms->ms_allocatable, + sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); return (0); @@ -4472,7 +4486,6 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, sp VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), checkpoint_sm_obj, 0, current_vd->vdev_asize, current_vd->vdev_ashift)); - space_map_update(checkpoint_sm); verify_checkpoint_sm_entry_cb_arg_t vcsec; vcsec.vcsec_vd = ckpoint_vd; @@ -4480,6 +4493,7 @@ verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, sp vcsec.vcsec_num_entries = space_map_length(checkpoint_sm) / sizeof (uint64_t); VERIFY0(space_map_iterate(checkpoint_sm, + space_map_length(checkpoint_sm), verify_checkpoint_sm_entry_cb, &vcsec)); dump_spacemap(current->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); @@ -4559,7 +4573,7 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_ * are part of the checkpoint were freed by mistake. */ range_tree_walk(ckpoint_msp->ms_allocatable, - (range_tree_func_t *)range_tree_verify, + (range_tree_func_t *)range_tree_verify_not_present, current_msp->ms_allocatable); } } @@ -4571,6 +4585,8 @@ verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_ static void verify_checkpoint_blocks(spa_t *spa) { + ASSERT(!dump_opt['L']); + spa_t *checkpoint_spa; char *checkpoint_pool; nvlist_t *config = NULL; @@ -4636,7 +4652,6 @@ dump_leftover_checkpoint_blocks(spa_t *spa) VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); - space_map_update(checkpoint_sm); dump_spacemap(spa->spa_meta_objset, checkpoint_sm); space_map_close(checkpoint_sm); } Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c Thu Nov 21 13:22:23 2019 (r354947) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c Thu Nov 21 13:35:43 2019 (r354948) @@ -584,45 +584,62 @@ metaslab_compare(const void *x1, const void *x2) return (AVL_CMP(m1->ms_start, m2->ms_start)); } +uint64_t +metaslab_allocated_space(metaslab_t *msp) +{ + return (msp->ms_allocated_space); +} + /* * Verify that the space accounting on disk matches the in-core range_trees. */ -void +static void metaslab_verify_space(metaslab_t *msp, uint64_t txg) { spa_t *spa = msp->ms_group->mg_vd->vdev_spa; - uint64_t allocated = 0; + uint64_t allocating = 0; uint64_t sm_free_space, msp_free_space; ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(!msp->ms_condensing); if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) return; /* * We can only verify the metaslab space when we're called - * from syncing context with a loaded metaslab that has an allocated - * space map. Calling this in non-syncing context does not - * provide a consistent view of the metaslab since we're performing - * allocations in the future. + * from syncing context with a loaded metaslab that has an + * allocated space map. Calling this in non-syncing context + * does not provide a consistent view of the metaslab since + * we're performing allocations in the future. */ if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || !msp->ms_loaded) return; - sm_free_space = msp->ms_size - space_map_allocated(msp->ms_sm) - - space_map_alloc_delta(msp->ms_sm); + /* + * Even though the smp_alloc field can get negative (e.g. + * see vdev_checkpoint_sm), that should never be the case + * when it come's to a metaslab's space map. + */ + ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); + sm_free_space = msp->ms_size - metaslab_allocated_space(msp); + /* - * Account for future allocations since we would have already - * deducted that space from the ms_freetree. + * Account for future allocations since we would have + * already deducted that space from the ms_allocatable. */ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { - allocated += + allocating += range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); } - msp_free_space = range_tree_space(msp->ms_allocatable) + allocated + + ASSERT3U(msp->ms_deferspace, ==, + range_tree_space(msp->ms_defer[0]) + + range_tree_space(msp->ms_defer[1])); + + msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + msp->ms_deferspace + range_tree_space(msp->ms_freed); VERIFY3U(sm_free_space, ==, msp_free_space); @@ -929,6 +946,7 @@ metaslab_group_histogram_verify(metaslab_group_t *mg) for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; + ASSERT(msp != NULL); /* skip if not active or not a member */ if (msp->ms_sm == NULL || msp->ms_group != mg) @@ -1470,7 +1488,204 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; * ========================================================================== */ +static void +metaslab_aux_histograms_clear(metaslab_t *msp) +{ + /* + * Auxiliary histograms are only cleared when resetting them, + * which can only happen while the metaslab is loaded. + */ + ASSERT(msp->ms_loaded); + + bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); + for (int t = 0; t < TXG_DEFER_SIZE; t++) + bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t])); +} + +static void +metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, + range_tree_t *rt) +{ + /* + * This is modeled after space_map_histogram_add(), so refer to that + * function for implementation details. We want this to work like + * the space map histogram, and not the range tree histogram, as we + * are essentially constructing a delta that will be later subtracted + * from the space map histogram. + */ + int idx = 0; + for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + ASSERT3U(i, >=, idx + shift); + histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); + + if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { + ASSERT3U(idx + shift, ==, i); + idx++; + ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); + } + } +} + /* + * Called at every sync pass that the metaslab gets synced. + * + * The reason is that we want our auxiliary histograms to be updated + * wherever the metaslab's space map histogram is updated. This way + * we stay consistent on which parts of the metaslab space map's + * histogram are currently not available for allocations (e.g because + * they are in the defer, freed, and freeing trees). + */ +static void +metaslab_aux_histograms_update(metaslab_t *msp) +{ + space_map_t *sm = msp->ms_sm; + ASSERT(sm != NULL); + + /* + * This is similar to the metaslab's space map histogram updates + * that take place in metaslab_sync(). The only difference is that + * we only care about segments that haven't made it into the + * ms_allocatable tree yet. + */ + if (msp->ms_loaded) { + metaslab_aux_histograms_clear(msp); + + metaslab_aux_histogram_add(msp->ms_synchist, + sm->sm_shift, msp->ms_freed); + + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + metaslab_aux_histogram_add(msp->ms_deferhist[t], + sm->sm_shift, msp->ms_defer[t]); + } + } + + metaslab_aux_histogram_add(msp->ms_synchist, + sm->sm_shift, msp->ms_freeing); +} + +/* + * Called every time we are done syncing (writing to) the metaslab, + * i.e. at the end of each sync pass. + * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] + */ +static void +metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + space_map_t *sm = msp->ms_sm; + + if (sm == NULL) { + /* + * We came here from metaslab_init() when creating/opening a + * pool, looking at a metaslab that hasn't had any allocations + * yet. + */ + return; + } + + /* + * This is similar to the actions that we take for the ms_freed + * and ms_defer trees in metaslab_sync_done(). + */ + uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; + if (defer_allowed) { + bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index], + sizeof (msp->ms_synchist)); + } else { + bzero(msp->ms_deferhist[hist_index], + sizeof (msp->ms_deferhist[hist_index])); + } + bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); +} + +/* + * Ensure that the metaslab's weight and fragmentation are consistent + * with the contents of the histogram (either the range tree's histogram + * or the space map's depending whether the metaslab is loaded). + */ +static void +metaslab_verify_weight_and_frag(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + /* see comment in metaslab_verify_unflushed_changes() */ + if (msp->ms_group == NULL) + return; + + /* + * Devices being removed always return a weight of 0 and leave + * fragmentation and ms_max_size as is - there is nothing for + * us to verify here. + */ + vdev_t *vd = msp->ms_group->mg_vd; + if (vd->vdev_removing) + return; + + /* + * If the metaslab is dirty it probably means that we've done + * some allocations or frees that have changed our histograms + * and thus the weight. + */ + for (int t = 0; t < TXG_SIZE; t++) { + if (txg_list_member(&vd->vdev_ms_list, msp, t)) + return; + } + + /* + * This verification checks that our in-memory state is consistent + * with what's on disk. If the pool is read-only then there aren't + * any changes and we just have the initially-loaded state. + */ + if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) + return; + + /* some extra verification for in-core tree if you can */ + if (msp->ms_loaded) { + range_tree_stat_verify(msp->ms_allocatable); + VERIFY(space_map_histogram_verify(msp->ms_sm, + msp->ms_allocatable)); + } + + uint64_t weight = msp->ms_weight; + uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; + boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); + uint64_t frag = msp->ms_fragmentation; + uint64_t max_segsize = msp->ms_max_size; + + msp->ms_weight = 0; + msp->ms_fragmentation = 0; + msp->ms_max_size = 0; + + /* + * This function is used for verification purposes. Regardless of + * whether metaslab_weight() thinks this metaslab should be active or + * not, we want to ensure that the actual weight (and therefore the + * value of ms_weight) would be the same if it was to be recalculated + * at this point. + */ + msp->ms_weight = metaslab_weight(msp) | was_active; + + VERIFY3U(max_segsize, ==, msp->ms_max_size); + + /* + * If the weight type changed then there is no point in doing + * verification. Revert fields to their original values. + */ + if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || + (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { + msp->ms_fragmentation = frag; + msp->ms_weight = weight; + return; + } + + VERIFY3U(msp->ms_fragmentation, ==, frag); + VERIFY3U(msp->ms_weight, ==, weight); +} + +/* * Wait for any in-progress metaslab loads to complete. */ static void @@ -1491,47 +1706,94 @@ metaslab_load_impl(metaslab_t *msp) ASSERT(MUTEX_HELD(&msp->ms_lock)); ASSERT(msp->ms_loading); + ASSERT(!msp->ms_condensing); /* - * Nobody else can manipulate a loading metaslab, so it's now safe - * to drop the lock. This way we don't have to hold the lock while - * reading the spacemap from disk. + * We temporarily drop the lock to unblock other operations while we + * are reading the space map. Therefore, metaslab_sync() and + * metaslab_sync_done() can run at the same time as we do. + * + * metaslab_sync() can append to the space map while we are loading. + * Therefore we load only entries that existed when we started the + * load. Additionally, metaslab_sync_done() has to wait for the load + * to complete because there are potential races like metaslab_load() + * loading parts of the space map that are currently being appended + * by metaslab_sync(). If we didn't, the ms_allocatable would have + * entries that metaslab_sync_done() would try to re-add later. + * + * That's why before dropping the lock we remember the synced length + * of the metaslab and read up to that point of the space map, + * ignoring entries appended by metaslab_sync() that happen after we + * drop the lock. */ + uint64_t length = msp->ms_synced_length; mutex_exit(&msp->ms_lock); - /* - * If the space map has not been allocated yet, then treat - * all the space in the metaslab as free and add it to ms_allocatable. - */ if (msp->ms_sm != NULL) { - error = space_map_load(msp->ms_sm, msp->ms_allocatable, - SM_FREE); + error = space_map_load_length(msp->ms_sm, msp->ms_allocatable, + SM_FREE, length); } else { + /* + * The space map has not been allocated yet, so treat + * all the space in the metaslab as free and add it to the + * ms_allocatable tree. + */ range_tree_add(msp->ms_allocatable, msp->ms_start, msp->ms_size); } + /* + * We need to grab the ms_sync_lock to prevent metaslab_sync() from + * changing the ms_sm and the metaslab's range trees while we are + * about to use them and populate the ms_allocatable. The ms_lock + * is insufficient for this because metaslab_sync() doesn't hold + * the ms_lock while writing the ms_checkpointing tree to disk. + */ + mutex_enter(&msp->ms_sync_lock); mutex_enter(&msp->ms_lock); + ASSERT(!msp->ms_condensing); - if (error != 0) + if (error != 0) { + mutex_exit(&msp->ms_sync_lock); return (error); + } ASSERT3P(msp->ms_group, !=, NULL); msp->ms_loaded = B_TRUE; /* - * If the metaslab already has a spacemap, then we need to - * remove all segments from the defer tree; otherwise, the - * metaslab is completely empty and we can skip this. + * The ms_allocatable contains the segments that exist in the + * ms_defer trees [see ms_synced_length]. Thus we need to remove + * them from ms_allocatable as they will be added again in + * metaslab_sync_done(). */ - if (msp->ms_sm != NULL) { - for (int t = 0; t < TXG_DEFER_SIZE; t++) { - range_tree_walk(msp->ms_defer[t], - range_tree_remove, msp->ms_allocatable); - } + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_walk(msp->ms_defer[t], + range_tree_remove, msp->ms_allocatable); } + + /* + * Call metaslab_recalculate_weight_and_sort() now that the + * metaslab is loaded so we get the metaslab's real weight. + * + * Unless this metaslab was created with older software and + * has not yet been converted to use segment-based weight, we + * expect the new weight to be better or equal to the weight + * that the metaslab had while it was not loaded. This is + * because the old weight does not take into account the + * consolidation of adjacent segments between TXGs. [see + * comment for ms_synchist and ms_deferhist[] for more info] + */ + uint64_t weight = msp->ms_weight; + metaslab_recalculate_weight_and_sort(msp); + if (!WEIGHT_IS_SPACEBASED(weight)) + ASSERT3U(weight, <=, msp->ms_weight); msp->ms_max_size = metaslab_block_maxsize(msp); + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + metaslab_verify_space(msp, spa_syncing_txg(spa)); + mutex_exit(&msp->ms_sync_lock); + return (0); } @@ -1548,6 +1810,7 @@ metaslab_load(metaslab_t *msp) if (msp->ms_loaded) return (0); VERIFY(!msp->ms_loading); + ASSERT(!msp->ms_condensing); msp->ms_loading = B_TRUE; int error = metaslab_load_impl(msp); @@ -1561,10 +1824,29 @@ void metaslab_unload(metaslab_t *msp) { ASSERT(MUTEX_HELD(&msp->ms_lock)); + + metaslab_verify_weight_and_frag(msp); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); msp->ms_loaded = B_FALSE; + msp->ms_weight &= ~METASLAB_ACTIVE_MASK; msp->ms_max_size = 0; + + /* + * We explicitly recalculate the metaslab's weight based on its space + * map (as it is now not loaded). We want unload metaslabs to always + * have their weights calculated from the space map histograms, while + * loaded ones have it calculated from their in-core range tree + * [see metaslab_load()]. This way, the weight reflects the information + * available in-core, whether it is loaded or not + * + * If ms_group == NULL means that we came here from metaslab_fini(), + * at which point it doesn't make sense for us to do the recalculation + * and the sorting. + */ + if (msp->ms_group != NULL) + metaslab_recalculate_weight_and_sort(msp); } static void @@ -1604,6 +1886,13 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint6 /* * We only open space map objects that already exist. All others * will be opened when we finally allocate an object for it. + * + * Note: + * When called from vdev_expand(), we can't call into the DMU as + * we are holding the spa_config_lock as a writer and we would + * deadlock [see relevant comment in vdev_metaslab_init()]. in + * that case, the object parameter is zero though, so we won't + * call into the DMU. */ if (object != 0) { error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start, @@ -1615,14 +1904,17 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint6 } ASSERT(ms->ms_sm != NULL); + ASSERT3S(space_map_allocated(ms->ms_sm), >=, 0); + ms->ms_allocated_space = space_map_allocated(ms->ms_sm); } /* - * We create the main range tree here, but we don't create the + * We create the ms_allocatable here, but we don't create the * other range trees until metaslab_sync_done(). This serves * two purposes: it allows metaslab_sync_done() to detect the - * addition of new space; and for debugging, it ensures that we'd - * data fault on any attempt to use this metaslab before it's ready. + * addition of new space; and for debugging, it ensures that + * we'd data fault on any attempt to use this metaslab before + * it's ready. */ ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0); @@ -1639,8 +1931,11 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint6 * out this txg. This ensures that we don't attempt to allocate * from it before we have initialized it completely. */ - if (txg <= TXG_INITIAL) + if (txg <= TXG_INITIAL) { metaslab_sync_done(ms, 0); + metaslab_space_update(vd, mg->mg_class, + metaslab_allocated_space(ms), 0, 0); + } /* * If metaslab_debug_load is set and we're initializing a metaslab @@ -1674,7 +1969,7 @@ metaslab_fini(metaslab_t *msp) mutex_enter(&msp->ms_lock); VERIFY(msp->ms_group == NULL); metaslab_space_update(vd, mg->mg_class, - -space_map_allocated(msp->ms_sm), 0, -msp->ms_size); + -metaslab_allocated_space(msp), 0, -msp->ms_size); space_map_close(msp->ms_sm); @@ -1695,6 +1990,9 @@ metaslab_fini(metaslab_t *msp) range_tree_destroy(msp->ms_checkpointing); + for (int t = 0; t < TXG_SIZE; t++) + ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); + mutex_exit(&msp->ms_lock); cv_destroy(&msp->ms_load_cv); mutex_destroy(&msp->ms_lock); @@ -1710,7 +2008,7 @@ metaslab_fini(metaslab_t *msp) * This table defines a segment size based fragmentation metric that will * allow each metaslab to derive its own fragmentation value. This is done * by calculating the space in each bucket of the spacemap histogram and - * multiplying that by the fragmetation metric in this table. Doing + * multiplying that by the fragmentation metric in this table. Doing *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201911211335.xALDZikp029490>