Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 3 Oct 2018 02:09:37 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-11@freebsd.org
Subject:   svn commit: r339104 - in stable/11: cddl/contrib/opensolaris/cmd/zdb cddl/contrib/opensolaris/cmd/zpool cddl/contrib/opensolaris/cmd/ztest sys/cddl/contrib/opensolaris/common/zfs sys/cddl/contrib/o...
Message-ID:  <201810030209.w9329bEr040634@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Wed Oct  3 02:09:36 2018
New Revision: 339104
URL: https://svnweb.freebsd.org/changeset/base/339104

Log:
  MFC r336947: MFV r336946: 9238 ZFS Spacemap Encoding V2
  
  The current space map encoding has the following disadvantages:
  [1] Assuming a 512-byte sector size, each entry can represent at most 16MB for a segment.
  This makes the encoding very inefficient for large regions of space.
  [2] As vdev-wide space maps have started to be used by new features (i.e.
  device removal, zpool checkpoint) we've started imposing limits in the
  vdevs that can be used with them based on the maximum addressable offset
  (currently 64PB for a top-level vdev).
  
  The new encoding remains backwards compatible with the old one. The introduced
  two-word entry format, besides extending the limits imposed by the single-entry
  layout, also includes a vdev field and some extra padding after its prefix.
  
  The extra padding after the prefix is reserved for future usage (e.g.
  new prefixes for future encodings or new fields for flags). The new vdev field
  not only makes the space maps more self-descriptive, but also opens the doors
  for pool-wide space maps.
  
  One final important note is that the number of bits used for vdevs is reduced
  to 24 bits for blkptrs. That was decided as we don't know of any setups that
  use more than 16M vdevs for the time being and
  we wanted to fit the vdev field in the space map. In addition that gives us
  some extra bits in dva_t.
  
  illumos/illumos-gate@17f11284b49b98353b5119463254074fd9bc0a28
  
  Reviewed by: Matt Ahrens <mahrens@delphix.com>
  Reviewed by: George Wilson <gwilson@zfsmail.com>
  Approved by: Gordon Ross <gwr@nexenta.com>
  Author: Serapheim Dimitropoulos <serapheim@delphix.com>

Modified:
  stable/11/cddl/contrib/opensolaris/cmd/zdb/zdb.c
  stable/11/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7
  stable/11/cddl/contrib/opensolaris/cmd/ztest/ztest.c
  stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
  stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
  stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect_mapping.c
Directory Properties:
  stable/11/   (props changed)

Modified: stable/11/cddl/contrib/opensolaris/cmd/zdb/zdb.c
==============================================================================
--- stable/11/cddl/contrib/opensolaris/cmd/zdb/zdb.c	Wed Oct  3 02:08:32 2018	(r339103)
+++ stable/11/cddl/contrib/opensolaris/cmd/zdb/zdb.c	Wed Oct  3 02:09:36 2018	(r339104)
@@ -774,7 +774,6 @@ verify_spacemap_refcounts(spa_t *spa)
 static void
 dump_spacemap(objset_t *os, space_map_t *sm)
 {
-	uint64_t alloc, offset, entry;
 	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
 	    "INVALID", "INVALID", "INVALID", "INVALID" };
 
@@ -791,41 +790,73 @@ dump_spacemap(objset_t *os, space_map_t *sm)
 	/*
 	 * Print out the freelist entries in both encoded and decoded form.
 	 */
-	alloc = 0;
-	for (offset = 0; offset < space_map_length(sm);
-	    offset += sizeof (entry)) {
-		uint8_t mapshift = sm->sm_shift;
+	uint8_t mapshift = sm->sm_shift;
+	int64_t alloc = 0;
+	uint64_t word;
+	for (uint64_t offset = 0; offset < space_map_length(sm);
+	    offset += sizeof (word)) {
 
 		VERIFY0(dmu_read(os, space_map_object(sm), offset,
-		    sizeof (entry), &entry, DMU_READ_PREFETCH));
-		if (SM_DEBUG_DECODE(entry)) {
+		    sizeof (word), &word, DMU_READ_PREFETCH));
 
+		if (sm_entry_is_debug(word)) {
 			(void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
-			    (u_longlong_t)(offset / sizeof (entry)),
-			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
-			    (u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
-			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
+			    (u_longlong_t)(offset / sizeof (word)),
+			    ddata[SM_DEBUG_ACTION_DECODE(word)],
+			    (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
+			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
+			continue;
+		}
+
+		uint8_t words;
+		char entry_type;
+		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;
+
+		if (sm_entry_is_single_word(word)) {
+			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
+			    'A' : 'F';
+			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
+			    sm->sm_start;
+			entry_run = SM_RUN_DECODE(word) << mapshift;
+			words = 1;
 		} else {
-			(void) printf("\t    [%6llu]    %c  range:"
-			    " %010llx-%010llx  size: %06llx\n",
-			    (u_longlong_t)(offset / sizeof (entry)),
-			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
-			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
-			    mapshift) + sm->sm_start),
-			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
-			    mapshift) + sm->sm_start +
-			    (SM_RUN_DECODE(entry) << mapshift)),
-			    (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
-			if (SM_TYPE_DECODE(entry) == SM_ALLOC)
-				alloc += SM_RUN_DECODE(entry) << mapshift;
-			else
-				alloc -= SM_RUN_DECODE(entry) << mapshift;
+			/* it is a two-word entry so we read another word */
+			ASSERT(sm_entry_is_double_word(word));
+
+			uint64_t extra_word;
+			offset += sizeof (extra_word);
+			VERIFY0(dmu_read(os, space_map_object(sm), offset,
+			    sizeof (extra_word), &extra_word,
+			    DMU_READ_PREFETCH));
+
+			ASSERT3U(offset, <=, space_map_length(sm));
+
+			entry_run = SM2_RUN_DECODE(word) << mapshift;
+			entry_vdev = SM2_VDEV_DECODE(word);
+			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
+			    'A' : 'F';
+			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
+			    mapshift) + sm->sm_start;
+			words = 2;
 		}
+
+		(void) printf("\t    [%6llu]    %c  range:"
+		    " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
+		    (u_longlong_t)(offset / sizeof (word)),
+		    entry_type, (u_longlong_t)entry_off,
+		    (u_longlong_t)(entry_off + entry_run),
+		    (u_longlong_t)entry_run,
+		    (u_longlong_t)entry_vdev, words);
+
+		if (entry_type == 'A')
+			alloc += entry_run;
+		else
+			alloc -= entry_run;
 	}
-	if (alloc != space_map_allocated(sm)) {
-		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
-		    "with space map summary (%llu)\n",
-		    (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
+	if ((uint64_t)alloc != space_map_allocated(sm)) {
+		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
+		    "with space map summary (%lld)\n",
+		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
 	}
 }
 
@@ -1155,7 +1186,7 @@ dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_clas
 	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
 		dump_dde(ddt, &dde, walk);
 
-	ASSERT(error == ENOENT);
+	ASSERT3U(error, ==, ENOENT);
 
 	(void) printf("\n");
 }
@@ -3097,15 +3128,14 @@ typedef struct checkpoint_sm_exclude_entry_arg {
 } checkpoint_sm_exclude_entry_arg_t;
 
 static int
-checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
-    void *arg)
+checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
 {
 	checkpoint_sm_exclude_entry_arg_t *cseea = arg;
 	vdev_t *vd = cseea->cseea_vd;
-	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-	uint64_t end = offset + size;
+	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+	uint64_t end = sme->sme_offset + sme->sme_run;
 
-	ASSERT(type == SM_FREE);
+	ASSERT(sme->sme_type == SM_FREE);
 
 	/*
 	 * Since the vdev_checkpoint_sm exists in the vdev level
@@ -3123,7 +3153,7 @@ checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_
 	 * metaslab boundaries. So if needed we could add code
 	 * that handles metaslab-crossing segments in the future.
 	 */
-	VERIFY3U(offset, >=, ms->ms_start);
+	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
 	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 
 	/*
@@ -3131,10 +3161,10 @@ checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_
 	 * also verify that the entry is there to begin with.
 	 */
 	mutex_enter(&ms->ms_lock);
-	range_tree_remove(ms->ms_allocatable, offset, size);
+	range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
 	mutex_exit(&ms->ms_lock);
 
-	cseea->cseea_checkpoint_size += size;
+	cseea->cseea_checkpoint_size += sme->sme_run;
 	return (0);
 }
 
@@ -4109,15 +4139,14 @@ typedef struct verify_checkpoint_sm_entry_cb_arg {
 #define	ENTRIES_PER_PROGRESS_UPDATE 10000
 
 static int
-verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
-    void *arg)
+verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
 {
 	verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
 	vdev_t *vd = vcsec->vcsec_vd;
-	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-	uint64_t end = offset + size;
+	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+	uint64_t end = sme->sme_offset + sme->sme_run;
 
-	ASSERT(type == SM_FREE);
+	ASSERT(sme->sme_type == SM_FREE);
 
 	if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
 		(void) fprintf(stderr,
@@ -4131,7 +4160,7 @@ verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t
 	/*
 	 * See comment in checkpoint_sm_exclude_entry_cb()
 	 */
-	VERIFY3U(offset, >=, ms->ms_start);
+	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
 	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 
 	/*
@@ -4140,7 +4169,7 @@ verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t
 	 * their respective ms_allocateable trees should not contain them.
 	 */
 	mutex_enter(&ms->ms_lock);
-	range_tree_verify(ms->ms_allocatable, offset, size);
+	range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
 	mutex_exit(&ms->ms_lock);
 
 	return (0);
@@ -4386,7 +4415,7 @@ verify_checkpoint(spa_t *spa)
 	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
 	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
 
-	if (error == ENOENT) {
+	if (error == ENOENT && !dump_opt['L']) {
 		/*
 		 * If the feature is active but the uberblock is missing
 		 * then we must be in the middle of discarding the
@@ -4409,7 +4438,7 @@ verify_checkpoint(spa_t *spa)
 		error = 3;
 	}
 
-	if (error == 0)
+	if (error == 0 && !dump_opt['L'])
 		verify_checkpoint_blocks(spa);
 
 	return (error);
@@ -4514,7 +4543,7 @@ dump_zpool(spa_t *spa)
 	if (dump_opt['h'])
 		dump_history(spa);
 
-	if (rc == 0 && !dump_opt['L'])
+	if (rc == 0)
 		rc = verify_checkpoint(spa);
 
 	if (rc != 0) {

Modified: stable/11/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7
==============================================================================
--- stable/11/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7	Wed Oct  3 02:08:32 2018	(r339103)
+++ stable/11/cddl/contrib/opensolaris/cmd/zpool/zpool-features.7	Wed Oct  3 02:09:36 2018	(r339104)
@@ -482,6 +482,24 @@ This feature becomes
 when the "zpool remove" command is
 used on a top-level vdev, and will never return to being
 .Sy enabled .
+.It Sy spacemap_v2
+.Bl -column "READ\-ONLY COMPATIBLE" "com.delphix:spacemap_v2"
+.It GUID Ta com.delphix:spacemap_v2
+.It READ\-ONLY COMPATIBLE Ta yes
+.It DEPENDENCIES Ta none
+.El
+.Pp
+This feature enables the use of the new space map encoding which
+consists of two words (instead of one) whenever it is advantageous.
+The new encoding allows space maps to represent large regions of
+space more efficiently on-disk while also increasing their maximum
+addressable offset.
+.Pp
+This feature becomes
+.Sy active
+as soon as it is enabled and will
+never return to being
+.Sy enabled .
 .It Sy large_blocks
 .Bl -column "READ\-ONLY COMPATIBLE" "org.open-zfs:large_block"
 .It GUID Ta org.open-zfs:large_block

Modified: stable/11/cddl/contrib/opensolaris/cmd/ztest/ztest.c
==============================================================================
--- stable/11/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Wed Oct  3 02:08:32 2018	(r339103)
+++ stable/11/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Wed Oct  3 02:09:36 2018	(r339104)
@@ -195,6 +195,7 @@ extern uint64_t zfs_deadman_synctime_ms;
 extern int metaslab_preload_limit;
 extern boolean_t zfs_compressed_arc_enabled;
 extern boolean_t zfs_abd_scatter_enabled;
+extern boolean_t zfs_force_some_double_word_sm_entries;
 
 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
@@ -6397,6 +6398,12 @@ main(int argc, char **argv)
 
 	dprintf_setup(&argc, argv);
 	zfs_deadman_synctime_ms = 300000;
+	/*
+	 * As two-word space map entries may not come up often (especially
+	 * if pool and vdev sizes are small) we want to force at least some
+	 * of them so the feature get tested.
+	 */
+	zfs_force_some_double_word_sm_entries = B_TRUE;
 
 	ztest_fd_rand = open("/dev/urandom", O_RDONLY);
 	ASSERT3S(ztest_fd_rand, >=, 0);

Modified: stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c	Wed Oct  3 02:08:32 2018	(r339103)
+++ stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.c	Wed Oct  3 02:09:36 2018	(r339104)
@@ -229,6 +229,12 @@ zpool_feature_init(void)
 	    "Pool state can be checkpointed, allowing rewind later.",
 	    ZFEATURE_FLAG_READONLY_COMPAT, NULL);
 
+	zfeature_register(SPA_FEATURE_SPACEMAP_V2,
+	    "com.delphix:spacemap_v2", "spacemap_v2",
+	    "Space maps representing large segments are more efficient.",
+	    ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+	    NULL);
+
 	static const spa_feature_t large_blocks_deps[] = {
 		SPA_FEATURE_EXTENSIBLE_DATASET,
 		SPA_FEATURE_NONE

Modified: stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h	Wed Oct  3 02:08:32 2018	(r339103)
+++ stable/11/sys/cddl/contrib/opensolaris/common/zfs/zfeature_common.h	Wed Oct  3 02:09:36 2018	(r339104)
@@ -60,6 +60,7 @@ typedef enum spa_feature {
 	SPA_FEATURE_DEVICE_REMOVAL,
 	SPA_FEATURE_OBSOLETE_COUNTS,
 	SPA_FEATURE_POOL_CHECKPOINT,
+	SPA_FEATURE_SPACEMAP_V2,
 	SPA_FEATURES
 } spa_feature_t;
 

Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	Wed Oct  3 02:08:32 2018	(r339103)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	Wed Oct  3 02:09:36 2018	(r339104)
@@ -2105,17 +2105,6 @@ metaslab_group_preload(metaslab_group_t *mg)
  *
  * 3. The on-disk size of the space map should actually decrease.
  *
- * Checking the first condition is tricky since we don't want to walk
- * the entire AVL tree calculating the estimated on-disk size. Instead we
- * use the size-ordered range tree in the metaslab and calculate the
- * size required to write out the largest segment in our free tree. If the
- * size required to represent that segment on disk is larger than the space
- * map object then we avoid condensing this map.
- *
- * To determine the second criterion we use a best-case estimate and assume
- * each segment can be represented on-disk as a single 64-bit entry. We refer
- * to this best-case estimate as the space map's minimal form.
- *
  * Unfortunately, we cannot compute the on-disk size of the space map in this
  * context because we cannot accurately compute the effects of compression, etc.
  * Instead, we apply the heuristic described in the block comment for
@@ -2126,9 +2115,6 @@ static boolean_t
 metaslab_should_condense(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
-	range_seg_t *rs;
-	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
-	dmu_object_info_t doi;
 	vdev_t *vd = msp->ms_group->mg_vd;
 	uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
 	uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
@@ -2154,34 +2140,22 @@ metaslab_should_condense(metaslab_t *msp)
 	msp->ms_condense_checked_txg = current_txg;
 
 	/*
-	 * Use the ms_allocatable_by_size range tree, which is ordered by
-	 * size, to obtain the largest segment in the free tree. We always
-	 * condense metaslabs that are empty and metaslabs for which a
-	 * condense request has been made.
+	 * We always condense metaslabs that are empty and metaslabs for
+	 * which a condense request has been made.
 	 */
-	rs = avl_last(&msp->ms_allocatable_by_size);
-	if (rs == NULL || msp->ms_condense_wanted)
+	if (avl_is_empty(&msp->ms_allocatable_by_size) ||
+	    msp->ms_condense_wanted)
 		return (B_TRUE);
 
-	/*
-	 * Calculate the number of 64-bit entries this segment would
-	 * require when written to disk. If this single segment would be
-	 * larger on-disk than the entire current on-disk structure, then
-	 * clearly condensing will increase the on-disk structure size.
-	 */
-	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
-	entries = size / (MIN(size, SM_RUN_MAX));
-	segsz = entries * sizeof (uint64_t);
+	uint64_t object_size = space_map_length(msp->ms_sm);
+	uint64_t optimal_size = space_map_estimate_optimal_size(sm,
+	    msp->ms_allocatable, SM_NO_VDEVID);
 
-	optimal_size =
-	    sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root);
-	object_size = space_map_length(msp->ms_sm);
-
+	dmu_object_info_t doi;
 	dmu_object_info_from_db(sm->sm_dbuf, &doi);
-	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
+	uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
 
-	return (segsz <= object_size &&
-	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
+	return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
 	    object_size > zfs_metaslab_condense_block_threshold * record_size);
 }
 
@@ -2256,11 +2230,11 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_t
 	 * optimal, this is typically close to optimal, and much cheaper to
 	 * compute.
 	 */
-	space_map_write(sm, condense_tree, SM_ALLOC, tx);
+	space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
 	range_tree_vacate(condense_tree, NULL, NULL);
 	range_tree_destroy(condense_tree);
 
-	space_map_write(sm, msp->ms_allocatable, SM_FREE, tx);
+	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
 	mutex_enter(&msp->ms_lock);
 	msp->ms_condensing = B_FALSE;
 }
@@ -2372,8 +2346,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 		metaslab_condense(msp, txg, tx);
 	} else {
 		mutex_exit(&msp->ms_lock);
-		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
-		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx);
+		space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
+		    SM_NO_VDEVID, tx);
+		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
+		    SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 	}
 
@@ -2388,7 +2364,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 		 */
 		mutex_exit(&msp->ms_lock);
 		space_map_write(vd->vdev_checkpoint_sm,
-		    msp->ms_checkpointing, SM_FREE, tx);
+		    msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 		space_map_update(vd->vdev_checkpoint_sm);
 

Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c	Wed Oct  3 02:08:32 2018	(r339103)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_checkpoint.c	Wed Oct  3 02:09:36 2018	(r339104)
@@ -203,13 +203,12 @@ typedef struct spa_checkpoint_discard_sync_callback_ar
 } spa_checkpoint_discard_sync_callback_arg_t;
 
 static int
-spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset,
-    uint64_t size, void *arg)
+spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg)
 {
 	spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
 	vdev_t *vd = sdc->sdc_vd;
-	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-	uint64_t end = offset + size;
+	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+	uint64_t end = sme->sme_offset + sme->sme_run;
 
 	if (sdc->sdc_entry_limit == 0)
 		return (EINTR);
@@ -224,8 +223,8 @@ spa_checkpoint_discard_sync_callback(maptype_t type, u
 	 * metaslab boundaries. So if needed we could add code
 	 * that handles metaslab-crossing segments in the future.
 	 */
-	VERIFY3U(type, ==, SM_FREE);
-	VERIFY3U(offset, >=, ms->ms_start);
+	VERIFY3U(sme->sme_type, ==, SM_FREE);
+	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
 	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 
 	/*
@@ -237,14 +236,15 @@ spa_checkpoint_discard_sync_callback(maptype_t type, u
 	mutex_enter(&ms->ms_lock);
 	if (range_tree_is_empty(ms->ms_freeing))
 		vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
-	range_tree_add(ms->ms_freeing, offset, size);
+	range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run);
 	mutex_exit(&ms->ms_lock);
 
-	ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, size);
-	ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, size);
+	ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=,
+	    sme->sme_run);
+	ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);
 
-	vd->vdev_spa->spa_checkpoint_info.sci_dspace -= size;
-	vd->vdev_stat.vs_checkpoint_space -= size;
+	vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
+	vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
 	sdc->sdc_entry_limit--;
 
 	return (0);
@@ -289,13 +289,14 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t
 	 * Thus, we set the maximum entries that the space map callback
 	 * will be applied to be half the entries that could fit in the
 	 * imposed memory limit.
+	 *
+	 * Note that since this is a conservative estimate we also
+	 * assume the worst case scenario in our computation where each
+	 * entry is two-word.
 	 */
 	uint64_t max_entry_limit =
-	    (zfs_spa_discard_memory_limit / sizeof (uint64_t)) >> 1;
+	    (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
 
-	uint64_t entries_in_sm =
-	    space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
-
 	/*
 	 * Iterate from the end of the space map towards the beginning,
 	 * placing its entries on ms_freeing and removing them from the
@@ -318,14 +319,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t
 	spa_checkpoint_discard_sync_callback_arg_t sdc;
 	sdc.sdc_vd = vd;
 	sdc.sdc_txg = tx->tx_txg;
-	sdc.sdc_entry_limit = MIN(entries_in_sm, max_entry_limit);
+	sdc.sdc_entry_limit = max_entry_limit;
 
-	uint64_t entries_before = entries_in_sm;
+	uint64_t words_before =
+	    space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
 
 	error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
 	    spa_checkpoint_discard_sync_callback, &sdc, tx);
 
-	uint64_t entries_after =
+	uint64_t words_after =
 	    space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
 
 #ifdef DEBUG
@@ -333,9 +335,9 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t
 #endif
 
 	zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, "
-	    "deleted %llu entries - %llu entries are left",
-	    tx->tx_txg, vd->vdev_id, (entries_before - entries_after),
-	    entries_after);
+	    "deleted %llu words - %llu words are left",
+	    tx->tx_txg, vd->vdev_id, (words_before - words_after),
+	    words_after);
 
 	if (error != EINTR) {
 		if (error != 0) {
@@ -344,15 +346,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t
 			    "space map of vdev %llu\n",
 			    error, vd->vdev_id);
 		}
-		ASSERT0(entries_after);
+		ASSERT0(words_after);
 		ASSERT0(vd->vdev_checkpoint_sm->sm_alloc);
-		ASSERT0(vd->vdev_checkpoint_sm->sm_length);
+		ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
 
 		space_map_free(vd->vdev_checkpoint_sm, tx);
 		space_map_close(vd->vdev_checkpoint_sm);
 		vd->vdev_checkpoint_sm = NULL;
 
-		VERIFY0(zap_remove(vd->vdev_spa->spa_meta_objset,
+		VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa),
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
 	}
 }

Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c	Wed Oct  3 02:08:32 2018	(r339103)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c	Wed Oct  3 02:09:36 2018	(r339104)
@@ -43,68 +43,194 @@ SYSCTL_DECL(_vfs_zfs);
  * Note on space map block size:
  *
  * The data for a given space map can be kept on blocks of any size.
- * Larger blocks entail fewer i/o operations, but they also cause the
- * DMU to keep more data in-core, and also to waste more i/o bandwidth
+ * Larger blocks entail fewer I/O operations, but they also cause the
+ * DMU to keep more data in-core, and also to waste more I/O bandwidth
  * when only a few blocks have changed since the last transaction group.
  */
 
 /*
+ * Enabled whenever we want to stress test the use of double-word
+ * space map entries.
+ */
+boolean_t zfs_force_some_double_word_sm_entries = B_FALSE;
+
+boolean_t
+sm_entry_is_debug(uint64_t e)
+{
+	return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
+}
+
+boolean_t
+sm_entry_is_single_word(uint64_t e)
+{
+	uint8_t prefix = SM_PREFIX_DECODE(e);
+	return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
+}
+
+boolean_t
+sm_entry_is_double_word(uint64_t e)
+{
+	return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
+}
+
+/*
  * Iterate through the space map, invoking the callback on each (non-debug)
  * space map entry.
  */
 int
 space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
 {
-	uint64_t *entry, *entry_map, *entry_map_end;
-	uint64_t bufsize, size, offset, end;
+	uint64_t sm_len = space_map_length(sm);
+	ASSERT3U(sm->sm_blksz, !=, 0);
+
+	dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len,
+	    ZIO_PRIORITY_SYNC_READ);
+
+	uint64_t blksz = sm->sm_blksz;
 	int error = 0;
+	for (uint64_t block_base = 0; block_base < sm_len && error == 0;
+	    block_base += blksz) {
+		dmu_buf_t *db;
+		error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
+		    block_base, FTAG, &db, DMU_READ_PREFETCH);
+		if (error != 0)
+			return (error);
 
-	end = space_map_length(sm);
+		uint64_t *block_start = db->db_data;
+		uint64_t block_length = MIN(sm_len - block_base, blksz);
+		uint64_t *block_end = block_start +
+		    (block_length / sizeof (uint64_t));
 
-	bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
-	entry_map = zio_buf_alloc(bufsize);
+		VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
+		VERIFY3U(block_length, !=, 0);
+		ASSERT3U(blksz, ==, db->db_size);
 
-	if (end > bufsize) {
-		dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
-		    end - bufsize, ZIO_PRIORITY_SYNC_READ);
+		for (uint64_t *block_cursor = block_start;
+		    block_cursor < block_end && error == 0; block_cursor++) {
+			uint64_t e = *block_cursor;
+
+			if (sm_entry_is_debug(e)) /* Skip debug entries */
+				continue;
+
+			uint64_t raw_offset, raw_run, vdev_id;
+			maptype_t type;
+			if (sm_entry_is_single_word(e)) {
+				type = SM_TYPE_DECODE(e);
+				vdev_id = SM_NO_VDEVID;
+				raw_offset = SM_OFFSET_DECODE(e);
+				raw_run = SM_RUN_DECODE(e);
+			} else {
+				/* it is a two-word entry */
+				ASSERT(sm_entry_is_double_word(e));
+				raw_run = SM2_RUN_DECODE(e);
+				vdev_id = SM2_VDEV_DECODE(e);
+
+				/* move on to the second word */
+				block_cursor++;
+				e = *block_cursor;
+				VERIFY3P(block_cursor, <=, block_end);
+
+				type = SM2_TYPE_DECODE(e);
+				raw_offset = SM2_OFFSET_DECODE(e);
+			}
+
+			uint64_t entry_offset = (raw_offset << sm->sm_shift) +
+			    sm->sm_start;
+			uint64_t entry_run = raw_run << sm->sm_shift;
+
+			VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
+			VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
+			ASSERT3U(entry_offset, >=, sm->sm_start);
+			ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size);
+			ASSERT3U(entry_run, <=, sm->sm_size);
+			ASSERT3U(entry_offset + entry_run, <=,
+			    sm->sm_start + sm->sm_size);
+
+			space_map_entry_t sme = {
+			    .sme_type = type,
+			    .sme_vdev = vdev_id,
+			    .sme_offset = entry_offset,
+			    .sme_run = entry_run
+			};
+			error = callback(&sme, arg);
+		}
+		dmu_buf_rele(db, FTAG);
 	}
+	return (error);
+}
 
-	for (offset = 0; offset < end && error == 0; offset += bufsize) {
-		size = MIN(end - offset, bufsize);
-		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
-		VERIFY(size != 0);
-		ASSERT3U(sm->sm_blksz, !=, 0);
+/*
+ * Reads the entries from the last block of the space map into
+ * buf in reverse order. Populates nwords with number of words
+ * in the last block.
+ *
+ * Refer to block comment within space_map_incremental_destroy()
+ * to understand why this function is needed.
+ */
+static int
+space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
+    uint64_t bufsz, uint64_t *nwords)
+{
+	int error = 0;
+	dmu_buf_t *db;
 
-		dprintf("object=%llu  offset=%llx  size=%llx\n",
-		    space_map_object(sm), offset, size);
+	/*
+	 * Find the offset of the last word in the space map and use
+	 * that to read the last block of the space map with
+	 * dmu_buf_hold().
+	 */
+	uint64_t last_word_offset =
+	    sm->sm_phys->smp_objsize - sizeof (uint64_t);
+	error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
+	    FTAG, &db, DMU_READ_NO_PREFETCH);
+	if (error != 0)
+		return (error);
 
-		error = dmu_read(sm->sm_os, space_map_object(sm), offset, size,
-		    entry_map, DMU_READ_PREFETCH);
-		if (error != 0)
-			break;
+	ASSERT3U(sm->sm_object, ==, db->db_object);
+	ASSERT3U(sm->sm_blksz, ==, db->db_size);
+	ASSERT3U(bufsz, >=, db->db_size);
+	ASSERT(nwords != NULL);
 
-		entry_map_end = entry_map + (size / sizeof (uint64_t));
-		for (entry = entry_map; entry < entry_map_end && error == 0;
-		    entry++) {
-			uint64_t e = *entry;
-			uint64_t offset, size;
+	uint64_t *words = db->db_data;
+	*nwords =
+	    (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
 
-			if (SM_DEBUG_DECODE(e))	/* Skip debug entries */
-				continue;
+	ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
 
-			offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
-			    sm->sm_start;
-			size = SM_RUN_DECODE(e) << sm->sm_shift;
+	uint64_t n = *nwords;
+	uint64_t j = n - 1;
+	for (uint64_t i = 0; i < n; i++) {
+		uint64_t entry = words[i];
+		if (sm_entry_is_double_word(entry)) {
+			/*
+			 * Since we are populating the buffer backwards
+			 * we have to be extra careful and add the two
+			 * words of the double-word entry in the right
+			 * order.
+			 */
+			ASSERT3U(j, >, 0);
+			buf[j - 1] = entry;
 
-			VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift));
-			VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift));
-			VERIFY3U(offset, >=, sm->sm_start);
-			VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size);
-			error = callback(SM_TYPE_DECODE(e), offset, size, arg);
+			i++;
+			ASSERT3U(i, <, n);
+			entry = words[i];
+			buf[j] = entry;
+			j -= 2;
+		} else {
+			ASSERT(sm_entry_is_debug(entry) ||
+			    sm_entry_is_single_word(entry));
+			buf[j] = entry;
+			j--;
 		}
 	}
 
-	zio_buf_free(entry_map, bufsize);
+	/*
+	 * Assert that we wrote backwards all the
+	 * way to the beginning of the buffer.
+	 */
+	ASSERT3S(j, ==, -1);
+
+	dmu_buf_rele(db, FTAG);
 	return (error);
 }
 
@@ -118,124 +244,122 @@ int
 space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
     dmu_tx_t *tx)
 {
-	uint64_t bufsize, len;
-	uint64_t *entry_map;
-	int error = 0;
+	uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
+	uint64_t *buf = zio_buf_alloc(bufsz);
 
-	len = space_map_length(sm);
-	bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
-	entry_map = zio_buf_alloc(bufsize);
-
 	dmu_buf_will_dirty(sm->sm_dbuf, tx);
 
 	/*
-	 * Since we can't move the starting offset of the space map
-	 * (e.g there are reference on-disk pointing to it), we destroy
-	 * its entries incrementally starting from the end.
+	 * Ideally we would want to iterate from the beginning of the
+	 * space map to the end in incremental steps. The issue with this
+	 * approach is that we don't have any field on-disk that points
+	 * us where to start between each step. We could try zeroing out
+	 * entries that we've destroyed, but this doesn't work either as
+	 * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]).
 	 *
-	 * The logic that follows is basically the same as the one used
-	 * in space_map_iterate() but it traverses the space map
-	 * backwards:
+	 * As a result, we destroy its entries incrementally starting from
+	 * the end after applying the callback to each of them.
 	 *
-	 * 1] We figure out the size of the buffer that we want to use
-	 *    to read the on-disk space map entries.
-	 * 2] We figure out the offset at the end of the space map where
-	 *    we will start reading entries into our buffer.
-	 * 3] We read the on-disk entries into the buffer.
-	 * 4] We iterate over the entries from end to beginning calling
-	 *    the callback function on each one. As we move from entry
-	 *    to entry we decrease the size of the space map, deleting
-	 *    effectively each entry.
-	 * 5] If there are no more entries in the space map or the
-	 *    callback returns a value other than 0, we stop iterating
-	 *    over the space map. If there are entries remaining and
-	 *    the callback returned zero we go back to step [1].
+	 * The problem with this approach is that we cannot literally
+	 * iterate through the words in the space map backwards as we
+	 * can't distinguish two-word space map entries from their second
+	 * word. Thus we do the following:
+	 *
+	 * 1] We get all the entries from the last block of the space map
+	 *    and put them into a buffer in reverse order. This way the
+	 *    last entry comes first in the buffer, the second to last is
+	 *    second, etc.
+	 * 2] We iterate through the entries in the buffer and we apply
+	 *    the callback to each one. As we move from entry to entry we
+	 *    decrease the size of the space map, deleting effectively
+	 *    each entry.
+	 * 3] If there are no more entries in the space map or the callback
+	 *    returns a value other than 0, we stop iterating over the
+	 *    space map. If there are entries remaining and the callback
+	 *    returned 0, we go back to step [1].
 	 */
-	uint64_t offset = 0, size = 0;
-	while (len > 0 && error == 0) {
-		size = MIN(bufsize, len);
-
-		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
-		VERIFY3U(size, >, 0);
-		ASSERT3U(sm->sm_blksz, !=, 0);
-
-		offset = len - size;
-
-		IMPLY(bufsize > len, offset == 0);
-		IMPLY(bufsize == len, offset == 0);
-		IMPLY(bufsize < len, offset > 0);
-
-
-		EQUIV(size == len, offset == 0);
-		IMPLY(size < len, bufsize < len);
-
-		dprintf("object=%llu  offset=%llx  size=%llx\n",
-		    space_map_object(sm), offset, size);
-
-		error = dmu_read(sm->sm_os, space_map_object(sm),
-		    offset, size, entry_map, DMU_READ_PREFETCH);
+	int error = 0;
+	while (space_map_length(sm) > 0 && error == 0) {
+		uint64_t nwords = 0;
+		error = space_map_reversed_last_block_entries(sm, buf, bufsz,
+		    &nwords);
 		if (error != 0)
 			break;
 
-		uint64_t num_entries = size / sizeof (uint64_t);
+		ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));
 
-		ASSERT3U(num_entries, >, 0);
+		for (uint64_t i = 0; i < nwords; i++) {
+			uint64_t e = buf[i];
 
-		while (num_entries > 0) {
-			uint64_t e, entry_offset, entry_size;
+			if (sm_entry_is_debug(e)) {
+				sm->sm_phys->smp_objsize -= sizeof (uint64_t);
+				space_map_update(sm);
+				continue;
+			}
+
+			int words = 1;
+			uint64_t raw_offset, raw_run, vdev_id;
 			maptype_t type;
+			if (sm_entry_is_single_word(e)) {
+				type = SM_TYPE_DECODE(e);
+				vdev_id = SM_NO_VDEVID;
+				raw_offset = SM_OFFSET_DECODE(e);
+				raw_run = SM_RUN_DECODE(e);
+			} else {
+				ASSERT(sm_entry_is_double_word(e));
+				words = 2;
 
-			e = entry_map[num_entries - 1];
+				raw_run = SM2_RUN_DECODE(e);
+				vdev_id = SM2_VDEV_DECODE(e);
 
-			ASSERT3U(num_entries, >, 0);
-			ASSERT0(error);
+				/* move to the second word */
+				i++;
+				e = buf[i];
 
-			if (SM_DEBUG_DECODE(e)) {
-				sm->sm_phys->smp_objsize -= sizeof (uint64_t);
-				space_map_update(sm);
-				len -= sizeof (uint64_t);
-				num_entries--;
-				continue;
+				ASSERT3P(i, <=, nwords);
+
+				type = SM2_TYPE_DECODE(e);
+				raw_offset = SM2_OFFSET_DECODE(e);
 			}
 
-			type = SM_TYPE_DECODE(e);
-			entry_offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
-			    sm->sm_start;
-			entry_size = SM_RUN_DECODE(e) << sm->sm_shift;
+			uint64_t entry_offset =
+			    (raw_offset << sm->sm_shift) + sm->sm_start;
+			uint64_t entry_run = raw_run << sm->sm_shift;
 
 			VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
-			VERIFY0(P2PHASE(entry_size, 1ULL << sm->sm_shift));
+			VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
 			VERIFY3U(entry_offset, >=, sm->sm_start);
-			VERIFY3U(entry_offset + entry_size, <=,
+			VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
+			VERIFY3U(entry_run, <=, sm->sm_size);
+			VERIFY3U(entry_offset + entry_run, <=,
 			    sm->sm_start + sm->sm_size);
 
-			error = callback(type, entry_offset, entry_size, arg);
+			space_map_entry_t sme = {
+			    .sme_type = type,
+			    .sme_vdev = vdev_id,
+			    .sme_offset = entry_offset,
+			    .sme_run = entry_run
+			};
+			error = callback(&sme, arg);
 			if (error != 0)
 				break;
 
 			if (type == SM_ALLOC)
-				sm->sm_phys->smp_alloc -= entry_size;
+				sm->sm_phys->smp_alloc -= entry_run;
 			else
-				sm->sm_phys->smp_alloc += entry_size;
-
-			sm->sm_phys->smp_objsize -= sizeof (uint64_t);
+				sm->sm_phys->smp_alloc += entry_run;
+			sm->sm_phys->smp_objsize -= words * sizeof (uint64_t);
 			space_map_update(sm);
-			len -= sizeof (uint64_t);
-			num_entries--;
 		}
-		IMPLY(error == 0, num_entries == 0);
-		EQUIV(offset == 0 && error == 0, len == 0 && num_entries == 0);
 	}
 
-	if (len == 0) {
+	if (space_map_length(sm) == 0) {
 		ASSERT0(error);
-		ASSERT0(offset);
-		ASSERT0(sm->sm_length);
 		ASSERT0(sm->sm_phys->smp_objsize);
 		ASSERT0(sm->sm_alloc);
 	}
 
-	zio_buf_free(entry_map, bufsize);
+	zio_buf_free(buf, bufsz);
 	return (error);
 }
 
@@ -246,16 +370,15 @@ typedef struct space_map_load_arg {
 } space_map_load_arg_t;
 
 static int
-space_map_load_callback(maptype_t type, uint64_t offset, uint64_t size,
-    void *arg)
+space_map_load_callback(space_map_entry_t *sme, void *arg)
 {
 	space_map_load_arg_t *smla = arg;
-	if (type == smla->smla_type) {
-		VERIFY3U(range_tree_space(smla->smla_rt) + size, <=,
+	if (sme->sme_type == smla->smla_type) {
+		VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=,
 		    smla->smla_sm->sm_size);
-		range_tree_add(smla->smla_rt, offset, size);
+		range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
 	} else {
-		range_tree_remove(smla->smla_rt, offset, size);
+		range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run);
 	}
 
 	return (0);
@@ -367,43 +490,239 @@ space_map_histogram_add(space_map_t *sm, range_tree_t 
 	}
 }
 
-uint64_t

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201810030209.w9329bEr040634>