Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 30 Jul 2018 22:56:25 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r336946 - vendor-sys/illumos/dist/common/zfs vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor/illumos/dist/cmd/zdb vendor/illumos/dist/cmd/...
Message-ID:  <201807302256.w6UMuPvv043700@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Mon Jul 30 22:56:24 2018
New Revision: 336946
URL: https://svnweb.freebsd.org/changeset/base/336946

Log:
  9238 ZFS Spacemap Encoding V2
  
  The current space map encoding has the following disadvantages:
  [1] Assuming a 512-byte sector size, each entry can represent at most 16MB for a segment.
  This makes the encoding very inefficient for large regions of space.
  [2] As vdev-wide space maps have started to be used by new features (i.e.
  device removal, zpool checkpoint) we've started imposing limits in the
  vdevs that can be used with them based on the maximum addressable offset
  (currently 64PB for a top-level vdev).
  
  The new encoding remains backwards compatible with the old one. The introduced
  two-word entry format, besides extending the limits imposed by the single-entry
  layout, also includes a vdev field and some extra padding after its prefix.
  
  The extra padding after the prefix is reserved for future usage (e.g.
  new prefixes for future encodings or new fields for flags). The new vdev field
  not only makes the space maps more self-descriptive, but also opens the doors
  for pool-wide space maps.
  
  One final important note is that the number of bits used for vdevs is reduced
  to 24 bits for blkptrs. That was decided as we don't know of any setups that
  use more than 16M vdevs for the time being and
  we wanted to fit the vdev field in the space map. In addition that gives us
  some extra bits in dva_t.
  
  illumos/illumos-gate@17f11284b49b98353b5119463254074fd9bc0a28
  
  Reviewed by: Matt Ahrens <mahrens@delphix.com>
  Reviewed by: George Wilson <gwilson@zfsmail.com>
  Approved by: Gordon Ross <gwr@nexenta.com>
  Author: Serapheim Dimitropoulos <serapheim@delphix.com>

Modified:
  vendor-sys/illumos/dist/common/zfs/zfeature_common.c
  vendor-sys/illumos/dist/common/zfs/zfeature_common.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa_checkpoint.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_map.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect_mapping.c

Changes in other areas also in this revision:
Modified:
  vendor/illumos/dist/cmd/zdb/zdb.c
  vendor/illumos/dist/cmd/ztest/ztest.c
  vendor/illumos/dist/man/man5/zpool-features.5

Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.c
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.c	Mon Jul 30 22:39:30 2018	(r336945)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.c	Mon Jul 30 22:56:24 2018	(r336946)
@@ -230,6 +230,12 @@ zpool_feature_init(void)
 	    "Pool state can be checkpointed, allowing rewind later.",
 	    ZFEATURE_FLAG_READONLY_COMPAT, NULL);
 
+	zfeature_register(SPA_FEATURE_SPACEMAP_V2,
+	    "com.delphix:spacemap_v2", "spacemap_v2",
+	    "Space maps representing large segments are more efficient.",
+	    ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE,
+	    NULL);
+
 	static const spa_feature_t large_blocks_deps[] = {
 		SPA_FEATURE_EXTENSIBLE_DATASET,
 		SPA_FEATURE_NONE

Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.h
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.h	Mon Jul 30 22:39:30 2018	(r336945)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.h	Mon Jul 30 22:56:24 2018	(r336946)
@@ -59,6 +59,7 @@ typedef enum spa_feature {
 	SPA_FEATURE_DEVICE_REMOVAL,
 	SPA_FEATURE_OBSOLETE_COUNTS,
 	SPA_FEATURE_POOL_CHECKPOINT,
+	SPA_FEATURE_SPACEMAP_V2,
 	SPA_FEATURES
 } spa_feature_t;
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c	Mon Jul 30 22:39:30 2018	(r336945)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c	Mon Jul 30 22:56:24 2018	(r336946)
@@ -2091,17 +2091,6 @@ metaslab_group_preload(metaslab_group_t *mg)
  *
  * 3. The on-disk size of the space map should actually decrease.
  *
- * Checking the first condition is tricky since we don't want to walk
- * the entire AVL tree calculating the estimated on-disk size. Instead we
- * use the size-ordered range tree in the metaslab and calculate the
- * size required to write out the largest segment in our free tree. If the
- * size required to represent that segment on disk is larger than the space
- * map object then we avoid condensing this map.
- *
- * To determine the second criterion we use a best-case estimate and assume
- * each segment can be represented on-disk as a single 64-bit entry. We refer
- * to this best-case estimate as the space map's minimal form.
- *
  * Unfortunately, we cannot compute the on-disk size of the space map in this
  * context because we cannot accurately compute the effects of compression, etc.
  * Instead, we apply the heuristic described in the block comment for
@@ -2112,9 +2101,6 @@ static boolean_t
 metaslab_should_condense(metaslab_t *msp)
 {
 	space_map_t *sm = msp->ms_sm;
-	range_seg_t *rs;
-	uint64_t size, entries, segsz, object_size, optimal_size, record_size;
-	dmu_object_info_t doi;
 	vdev_t *vd = msp->ms_group->mg_vd;
 	uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
 	uint64_t current_txg = spa_syncing_txg(vd->vdev_spa);
@@ -2140,34 +2126,22 @@ metaslab_should_condense(metaslab_t *msp)
 	msp->ms_condense_checked_txg = current_txg;
 
 	/*
-	 * Use the ms_allocatable_by_size range tree, which is ordered by
-	 * size, to obtain the largest segment in the free tree. We always
-	 * condense metaslabs that are empty and metaslabs for which a
-	 * condense request has been made.
+	 * We always condense metaslabs that are empty and metaslabs for
+	 * which a condense request has been made.
 	 */
-	rs = avl_last(&msp->ms_allocatable_by_size);
-	if (rs == NULL || msp->ms_condense_wanted)
+	if (avl_is_empty(&msp->ms_allocatable_by_size) ||
+	    msp->ms_condense_wanted)
 		return (B_TRUE);
 
-	/*
-	 * Calculate the number of 64-bit entries this segment would
-	 * require when written to disk. If this single segment would be
-	 * larger on-disk than the entire current on-disk structure, then
-	 * clearly condensing will increase the on-disk structure size.
-	 */
-	size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
-	entries = size / (MIN(size, SM_RUN_MAX));
-	segsz = entries * sizeof (uint64_t);
+	uint64_t object_size = space_map_length(msp->ms_sm);
+	uint64_t optimal_size = space_map_estimate_optimal_size(sm,
+	    msp->ms_allocatable, SM_NO_VDEVID);
 
-	optimal_size =
-	    sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root);
-	object_size = space_map_length(msp->ms_sm);
-
+	dmu_object_info_t doi;
 	dmu_object_info_from_db(sm->sm_dbuf, &doi);
-	record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
+	uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
 
-	return (segsz <= object_size &&
-	    object_size >= (optimal_size * zfs_condense_pct / 100) &&
+	return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
 	    object_size > zfs_metaslab_condense_block_threshold * record_size);
 }
 
@@ -2242,11 +2216,11 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_t
 	 * optimal, this is typically close to optimal, and much cheaper to
 	 * compute.
 	 */
-	space_map_write(sm, condense_tree, SM_ALLOC, tx);
+	space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx);
 	range_tree_vacate(condense_tree, NULL, NULL);
 	range_tree_destroy(condense_tree);
 
-	space_map_write(sm, msp->ms_allocatable, SM_FREE, tx);
+	space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
 	mutex_enter(&msp->ms_lock);
 	msp->ms_condensing = B_FALSE;
 }
@@ -2358,8 +2332,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 		metaslab_condense(msp, txg, tx);
 	} else {
 		mutex_exit(&msp->ms_lock);
-		space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
-		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx);
+		space_map_write(msp->ms_sm, alloctree, SM_ALLOC,
+		    SM_NO_VDEVID, tx);
+		space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE,
+		    SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 	}
 
@@ -2374,7 +2350,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
 		 */
 		mutex_exit(&msp->ms_lock);
 		space_map_write(vd->vdev_checkpoint_sm,
-		    msp->ms_checkpointing, SM_FREE, tx);
+		    msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx);
 		mutex_enter(&msp->ms_lock);
 		space_map_update(vd->vdev_checkpoint_sm);
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c	Mon Jul 30 22:39:30 2018	(r336945)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c	Mon Jul 30 22:56:24 2018	(r336946)
@@ -179,7 +179,7 @@ range_tree_add(void *arg, uint64_t start, uint64_t siz
 	}
 
 	/* Make sure we don't overlap with either of our neighbors */
-	VERIFY(rs == NULL);
+	VERIFY3P(rs, ==, NULL);
 
 	rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
 	rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/spa_checkpoint.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/spa_checkpoint.c	Mon Jul 30 22:39:30 2018	(r336945)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/spa_checkpoint.c	Mon Jul 30 22:56:24 2018	(r336946)
@@ -203,13 +203,12 @@ typedef struct spa_checkpoint_discard_sync_callback_ar
 } spa_checkpoint_discard_sync_callback_arg_t;
 
 static int
-spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset,
-    uint64_t size, void *arg)
+spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg)
 {
 	spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
 	vdev_t *vd = sdc->sdc_vd;
-	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-	uint64_t end = offset + size;
+	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+	uint64_t end = sme->sme_offset + sme->sme_run;
 
 	if (sdc->sdc_entry_limit == 0)
 		return (EINTR);
@@ -224,8 +223,8 @@ spa_checkpoint_discard_sync_callback(maptype_t type, u
 	 * metaslab boundaries. So if needed we could add code
 	 * that handles metaslab-crossing segments in the future.
 	 */
-	VERIFY3U(type, ==, SM_FREE);
-	VERIFY3U(offset, >=, ms->ms_start);
+	VERIFY3U(sme->sme_type, ==, SM_FREE);
+	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
 	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 
 	/*
@@ -237,14 +236,15 @@ spa_checkpoint_discard_sync_callback(maptype_t type, u
 	mutex_enter(&ms->ms_lock);
 	if (range_tree_is_empty(ms->ms_freeing))
 		vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
-	range_tree_add(ms->ms_freeing, offset, size);
+	range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run);
 	mutex_exit(&ms->ms_lock);
 
-	ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, size);
-	ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, size);
+	ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=,
+	    sme->sme_run);
+	ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);
 
-	vd->vdev_spa->spa_checkpoint_info.sci_dspace -= size;
-	vd->vdev_stat.vs_checkpoint_space -= size;
+	vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
+	vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
 	sdc->sdc_entry_limit--;
 
 	return (0);
@@ -289,13 +289,14 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t
 	 * Thus, we set the maximum entries that the space map callback
 	 * will be applied to be half the entries that could fit in the
 	 * imposed memory limit.
+	 *
+	 * Note that since this is a conservative estimate we also
+	 * assume the worst case scenario in our computation where each
+	 * entry is two-word.
 	 */
 	uint64_t max_entry_limit =
-	    (zfs_spa_discard_memory_limit / sizeof (uint64_t)) >> 1;
+	    (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
 
-	uint64_t entries_in_sm =
-	    space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
-
 	/*
 	 * Iterate from the end of the space map towards the beginning,
 	 * placing its entries on ms_freeing and removing them from the
@@ -318,14 +319,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t
 	spa_checkpoint_discard_sync_callback_arg_t sdc;
 	sdc.sdc_vd = vd;
 	sdc.sdc_txg = tx->tx_txg;
-	sdc.sdc_entry_limit = MIN(entries_in_sm, max_entry_limit);
+	sdc.sdc_entry_limit = max_entry_limit;
 
-	uint64_t entries_before = entries_in_sm;
+	uint64_t words_before =
+	    space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
 
 	error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
 	    spa_checkpoint_discard_sync_callback, &sdc, tx);
 
-	uint64_t entries_after =
+	uint64_t words_after =
 	    space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
 
 #ifdef DEBUG
@@ -333,9 +335,9 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t
 #endif
 
 	zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, "
-	    "deleted %llu entries - %llu entries are left",
-	    tx->tx_txg, vd->vdev_id, (entries_before - entries_after),
-	    entries_after);
+	    "deleted %llu words - %llu words are left",
+	    tx->tx_txg, vd->vdev_id, (words_before - words_after),
+	    words_after);
 
 	if (error != EINTR) {
 		if (error != 0) {
@@ -344,15 +346,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t
 			    "space map of vdev %llu\n",
 			    error, vd->vdev_id);
 		}
-		ASSERT0(entries_after);
+		ASSERT0(words_after);
 		ASSERT0(vd->vdev_checkpoint_sm->sm_alloc);
-		ASSERT0(vd->vdev_checkpoint_sm->sm_length);
+		ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
 
 		space_map_free(vd->vdev_checkpoint_sm, tx);
 		space_map_close(vd->vdev_checkpoint_sm);
 		vd->vdev_checkpoint_sm = NULL;
 
-		VERIFY0(zap_remove(vd->vdev_spa->spa_meta_objset,
+		VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa),
 		    vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
 	}
 }

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c	Mon Jul 30 22:39:30 2018	(r336945)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c	Mon Jul 30 22:56:24 2018	(r336946)
@@ -41,68 +41,194 @@
  * Note on space map block size:
  *
  * The data for a given space map can be kept on blocks of any size.
- * Larger blocks entail fewer i/o operations, but they also cause the
- * DMU to keep more data in-core, and also to waste more i/o bandwidth
+ * Larger blocks entail fewer I/O operations, but they also cause the
+ * DMU to keep more data in-core, and also to waste more I/O bandwidth
  * when only a few blocks have changed since the last transaction group.
  */
 
 /*
+ * Enabled whenever we want to stress test the use of double-word
+ * space map entries.
+ */
+boolean_t zfs_force_some_double_word_sm_entries = B_FALSE;
+
+boolean_t
+sm_entry_is_debug(uint64_t e)
+{
+	return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
+}
+
+boolean_t
+sm_entry_is_single_word(uint64_t e)
+{
+	uint8_t prefix = SM_PREFIX_DECODE(e);
+	return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
+}
+
+boolean_t
+sm_entry_is_double_word(uint64_t e)
+{
+	return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
+}
+
+/*
  * Iterate through the space map, invoking the callback on each (non-debug)
  * space map entry.
  */
 int
 space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
 {
-	uint64_t *entry, *entry_map, *entry_map_end;
-	uint64_t bufsize, size, offset, end;
+	uint64_t sm_len = space_map_length(sm);
+	ASSERT3U(sm->sm_blksz, !=, 0);
+
+	dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len,
+	    ZIO_PRIORITY_SYNC_READ);
+
+	uint64_t blksz = sm->sm_blksz;
 	int error = 0;
+	for (uint64_t block_base = 0; block_base < sm_len && error == 0;
+	    block_base += blksz) {
+		dmu_buf_t *db;
+		error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
+		    block_base, FTAG, &db, DMU_READ_PREFETCH);
+		if (error != 0)
+			return (error);
 
-	end = space_map_length(sm);
+		uint64_t *block_start = db->db_data;
+		uint64_t block_length = MIN(sm_len - block_base, blksz);
+		uint64_t *block_end = block_start +
+		    (block_length / sizeof (uint64_t));
 
-	bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
-	entry_map = zio_buf_alloc(bufsize);
+		VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
+		VERIFY3U(block_length, !=, 0);
+		ASSERT3U(blksz, ==, db->db_size);
 
-	if (end > bufsize) {
-		dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
-		    end - bufsize, ZIO_PRIORITY_SYNC_READ);
+		for (uint64_t *block_cursor = block_start;
+		    block_cursor < block_end && error == 0; block_cursor++) {
+			uint64_t e = *block_cursor;
+
+			if (sm_entry_is_debug(e)) /* Skip debug entries */
+				continue;
+
+			uint64_t raw_offset, raw_run, vdev_id;
+			maptype_t type;
+			if (sm_entry_is_single_word(e)) {
+				type = SM_TYPE_DECODE(e);
+				vdev_id = SM_NO_VDEVID;
+				raw_offset = SM_OFFSET_DECODE(e);
+				raw_run = SM_RUN_DECODE(e);
+			} else {
+				/* it is a two-word entry */
+				ASSERT(sm_entry_is_double_word(e));
+				raw_run = SM2_RUN_DECODE(e);
+				vdev_id = SM2_VDEV_DECODE(e);
+
+				/* move on to the second word */
+				block_cursor++;
+				e = *block_cursor;
+				VERIFY3P(block_cursor, <=, block_end);
+
+				type = SM2_TYPE_DECODE(e);
+				raw_offset = SM2_OFFSET_DECODE(e);
+			}
+
+			uint64_t entry_offset = (raw_offset << sm->sm_shift) +
+			    sm->sm_start;
+			uint64_t entry_run = raw_run << sm->sm_shift;
+
+			VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
+			VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
+			ASSERT3U(entry_offset, >=, sm->sm_start);
+			ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size);
+			ASSERT3U(entry_run, <=, sm->sm_size);
+			ASSERT3U(entry_offset + entry_run, <=,
+			    sm->sm_start + sm->sm_size);
+
+			space_map_entry_t sme = {
+			    .sme_type = type,
+			    .sme_vdev = vdev_id,
+			    .sme_offset = entry_offset,
+			    .sme_run = entry_run
+			};
+			error = callback(&sme, arg);
+		}
+		dmu_buf_rele(db, FTAG);
 	}
+	return (error);
+}
 
-	for (offset = 0; offset < end && error == 0; offset += bufsize) {
-		size = MIN(end - offset, bufsize);
-		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
-		VERIFY(size != 0);
-		ASSERT3U(sm->sm_blksz, !=, 0);
+/*
+ * Reads the entries from the last block of the space map into
+ * buf in reverse order. Populates nwords with number of words
+ * in the last block.
+ *
+ * Refer to block comment within space_map_incremental_destroy()
+ * to understand why this function is needed.
+ */
+static int
+space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
+    uint64_t bufsz, uint64_t *nwords)
+{
+	int error = 0;
+	dmu_buf_t *db;
 
-		dprintf("object=%llu  offset=%llx  size=%llx\n",
-		    space_map_object(sm), offset, size);
+	/*
+	 * Find the offset of the last word in the space map and use
+	 * that to read the last block of the space map with
+	 * dmu_buf_hold().
+	 */
+	uint64_t last_word_offset =
+	    sm->sm_phys->smp_objsize - sizeof (uint64_t);
+	error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
+	    FTAG, &db, DMU_READ_NO_PREFETCH);
+	if (error != 0)
+		return (error);
 
-		error = dmu_read(sm->sm_os, space_map_object(sm), offset, size,
-		    entry_map, DMU_READ_PREFETCH);
-		if (error != 0)
-			break;
+	ASSERT3U(sm->sm_object, ==, db->db_object);
+	ASSERT3U(sm->sm_blksz, ==, db->db_size);
+	ASSERT3U(bufsz, >=, db->db_size);
+	ASSERT(nwords != NULL);
 
-		entry_map_end = entry_map + (size / sizeof (uint64_t));
-		for (entry = entry_map; entry < entry_map_end && error == 0;
-		    entry++) {
-			uint64_t e = *entry;
-			uint64_t offset, size;
+	uint64_t *words = db->db_data;
+	*nwords =
+	    (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
 
-			if (SM_DEBUG_DECODE(e))	/* Skip debug entries */
-				continue;
+	ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));
 
-			offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
-			    sm->sm_start;
-			size = SM_RUN_DECODE(e) << sm->sm_shift;
+	uint64_t n = *nwords;
+	uint64_t j = n - 1;
+	for (uint64_t i = 0; i < n; i++) {
+		uint64_t entry = words[i];
+		if (sm_entry_is_double_word(entry)) {
+			/*
+			 * Since we are populating the buffer backwards
+			 * we have to be extra careful and add the two
+			 * words of the double-word entry in the right
+			 * order.
+			 */
+			ASSERT3U(j, >, 0);
+			buf[j - 1] = entry;
 
-			VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift));
-			VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift));
-			VERIFY3U(offset, >=, sm->sm_start);
-			VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size);
-			error = callback(SM_TYPE_DECODE(e), offset, size, arg);
+			i++;
+			ASSERT3U(i, <, n);
+			entry = words[i];
+			buf[j] = entry;
+			j -= 2;
+		} else {
+			ASSERT(sm_entry_is_debug(entry) ||
+			    sm_entry_is_single_word(entry));
+			buf[j] = entry;
+			j--;
 		}
 	}
 
-	zio_buf_free(entry_map, bufsize);
+	/*
+	 * Assert that we wrote backwards all the
+	 * way to the beginning of the buffer.
+	 */
+	ASSERT3S(j, ==, -1);
+
+	dmu_buf_rele(db, FTAG);
 	return (error);
 }
 
@@ -116,124 +242,122 @@ int
 space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
     dmu_tx_t *tx)
 {
-	uint64_t bufsize, len;
-	uint64_t *entry_map;
-	int error = 0;
+	uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
+	uint64_t *buf = zio_buf_alloc(bufsz);
 
-	len = space_map_length(sm);
-	bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
-	entry_map = zio_buf_alloc(bufsize);
-
 	dmu_buf_will_dirty(sm->sm_dbuf, tx);
 
 	/*
-	 * Since we can't move the starting offset of the space map
-	 * (e.g there are reference on-disk pointing to it), we destroy
-	 * its entries incrementally starting from the end.
+	 * Ideally we would want to iterate from the beginning of the
+	 * space map to the end in incremental steps. The issue with this
+	 * approach is that we don't have any field on-disk that points
+	 * us where to start between each step. We could try zeroing out
+	 * entries that we've destroyed, but this doesn't work either as
+	 * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]).
 	 *
-	 * The logic that follows is basically the same as the one used
-	 * in space_map_iterate() but it traverses the space map
-	 * backwards:
+	 * As a result, we destroy its entries incrementally starting from
+	 * the end after applying the callback to each of them.
 	 *
-	 * 1] We figure out the size of the buffer that we want to use
-	 *    to read the on-disk space map entries.
-	 * 2] We figure out the offset at the end of the space map where
-	 *    we will start reading entries into our buffer.
-	 * 3] We read the on-disk entries into the buffer.
-	 * 4] We iterate over the entries from end to beginning calling
-	 *    the callback function on each one. As we move from entry
-	 *    to entry we decrease the size of the space map, deleting
-	 *    effectively each entry.
-	 * 5] If there are no more entries in the space map or the
-	 *    callback returns a value other than 0, we stop iterating
-	 *    over the space map. If there are entries remaining and
-	 *    the callback returned zero we go back to step [1].
+	 * The problem with this approach is that we cannot literally
+	 * iterate through the words in the space map backwards as we
+	 * can't distinguish two-word space map entries from their second
+	 * word. Thus we do the following:
+	 *
+	 * 1] We get all the entries from the last block of the space map
+	 *    and put them into a buffer in reverse order. This way the
+	 *    last entry comes first in the buffer, the second to last is
+	 *    second, etc.
+	 * 2] We iterate through the entries in the buffer and we apply
+	 *    the callback to each one. As we move from entry to entry
+	 *    we decrease the size of the space map, deleting effectively
+	 *    each entry.
+	 * 3] If there are no more entries in the space map or the callback
+	 *    returns a value other than 0, we stop iterating over the
+	 *    space map. If there are entries remaining and the callback
+	 *    returned 0, we go back to step [1].
 	 */
-	uint64_t offset = 0, size = 0;
-	while (len > 0 && error == 0) {
-		size = MIN(bufsize, len);
-
-		VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
-		VERIFY3U(size, >, 0);
-		ASSERT3U(sm->sm_blksz, !=, 0);
-
-		offset = len - size;
-
-		IMPLY(bufsize > len, offset == 0);
-		IMPLY(bufsize == len, offset == 0);
-		IMPLY(bufsize < len, offset > 0);
-
-
-		EQUIV(size == len, offset == 0);
-		IMPLY(size < len, bufsize < len);
-
-		dprintf("object=%llu  offset=%llx  size=%llx\n",
-		    space_map_object(sm), offset, size);
-
-		error = dmu_read(sm->sm_os, space_map_object(sm),
-		    offset, size, entry_map, DMU_READ_PREFETCH);
+	int error = 0;
+	while (space_map_length(sm) > 0 && error == 0) {
+		uint64_t nwords = 0;
+		error = space_map_reversed_last_block_entries(sm, buf, bufsz,
+		    &nwords);
 		if (error != 0)
 			break;
 
-		uint64_t num_entries = size / sizeof (uint64_t);
+		ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));
 
-		ASSERT3U(num_entries, >, 0);
+		for (uint64_t i = 0; i < nwords; i++) {
+			uint64_t e = buf[i];
 
-		while (num_entries > 0) {
-			uint64_t e, entry_offset, entry_size;
+			if (sm_entry_is_debug(e)) {
+				sm->sm_phys->smp_objsize -= sizeof (uint64_t);
+				space_map_update(sm);
+				continue;
+			}
+
+			int words = 1;
+			uint64_t raw_offset, raw_run, vdev_id;
 			maptype_t type;
+			if (sm_entry_is_single_word(e)) {
+				type = SM_TYPE_DECODE(e);
+				vdev_id = SM_NO_VDEVID;
+				raw_offset = SM_OFFSET_DECODE(e);
+				raw_run = SM_RUN_DECODE(e);
+			} else {
+				ASSERT(sm_entry_is_double_word(e));
+				words = 2;
 
-			e = entry_map[num_entries - 1];
+				raw_run = SM2_RUN_DECODE(e);
+				vdev_id = SM2_VDEV_DECODE(e);
 
-			ASSERT3U(num_entries, >, 0);
-			ASSERT0(error);
+				/* move to the second word */
+				i++;
+				e = buf[i];
 
-			if (SM_DEBUG_DECODE(e)) {
-				sm->sm_phys->smp_objsize -= sizeof (uint64_t);
-				space_map_update(sm);
-				len -= sizeof (uint64_t);
-				num_entries--;
-				continue;
+				ASSERT3P(i, <=, nwords);
+
+				type = SM2_TYPE_DECODE(e);
+				raw_offset = SM2_OFFSET_DECODE(e);
 			}
 
-			type = SM_TYPE_DECODE(e);
-			entry_offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
-			    sm->sm_start;
-			entry_size = SM_RUN_DECODE(e) << sm->sm_shift;
+			uint64_t entry_offset =
+			    (raw_offset << sm->sm_shift) + sm->sm_start;
+			uint64_t entry_run = raw_run << sm->sm_shift;
 
 			VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
-			VERIFY0(P2PHASE(entry_size, 1ULL << sm->sm_shift));
+			VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
 			VERIFY3U(entry_offset, >=, sm->sm_start);
-			VERIFY3U(entry_offset + entry_size, <=,
+			VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
+			VERIFY3U(entry_run, <=, sm->sm_size);
+			VERIFY3U(entry_offset + entry_run, <=,
 			    sm->sm_start + sm->sm_size);
 
-			error = callback(type, entry_offset, entry_size, arg);
+			space_map_entry_t sme = {
+			    .sme_type = type,
+			    .sme_vdev = vdev_id,
+			    .sme_offset = entry_offset,
+			    .sme_run = entry_run
+			};
+			error = callback(&sme, arg);
 			if (error != 0)
 				break;
 
 			if (type == SM_ALLOC)
-				sm->sm_phys->smp_alloc -= entry_size;
+				sm->sm_phys->smp_alloc -= entry_run;
 			else
-				sm->sm_phys->smp_alloc += entry_size;
-
-			sm->sm_phys->smp_objsize -= sizeof (uint64_t);
+				sm->sm_phys->smp_alloc += entry_run;
+			sm->sm_phys->smp_objsize -= words * sizeof (uint64_t);
 			space_map_update(sm);
-			len -= sizeof (uint64_t);
-			num_entries--;
 		}
-		IMPLY(error == 0, num_entries == 0);
-		EQUIV(offset == 0 && error == 0, len == 0 && num_entries == 0);
 	}
 
-	if (len == 0) {
+	if (space_map_length(sm) == 0) {
 		ASSERT0(error);
-		ASSERT0(offset);
-		ASSERT0(sm->sm_length);
 		ASSERT0(sm->sm_phys->smp_objsize);
 		ASSERT0(sm->sm_alloc);
 	}
 
-	zio_buf_free(entry_map, bufsize);
+	zio_buf_free(buf, bufsz);
 	return (error);
 }
 
@@ -244,16 +368,15 @@ typedef struct space_map_load_arg {
 } space_map_load_arg_t;
 
 static int
-space_map_load_callback(maptype_t type, uint64_t offset, uint64_t size,
-    void *arg)
+space_map_load_callback(space_map_entry_t *sme, void *arg)
 {
 	space_map_load_arg_t *smla = arg;
-	if (type == smla->smla_type) {
-		VERIFY3U(range_tree_space(smla->smla_rt) + size, <=,
+	if (sme->sme_type == smla->smla_type) {
+		VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=,
 		    smla->smla_sm->sm_size);
-		range_tree_add(smla->smla_rt, offset, size);
+		range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
 	} else {
-		range_tree_remove(smla->smla_rt, offset, size);
+		range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run);
 	}
 
 	return (0);
@@ -365,43 +488,239 @@ space_map_histogram_add(space_map_t *sm, range_tree_t 
 	}
 }
 
-uint64_t
-space_map_entries(space_map_t *sm, range_tree_t *rt)
+static void
+space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
 {	/* appends one debug-prefixed word encoding maptype, sync pass, txg */
-	avl_tree_t *t = &rt->rt_root;
-	range_seg_t *rs;
-	uint64_t size, entries;
+	dmu_buf_will_dirty(sm->sm_dbuf, tx);	/* sm_phys is updated below */
 
+	uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
+	    SM_DEBUG_ACTION_ENCODE(maptype) |
+	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
+	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
+
+	dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize,
+	    sizeof (dentry), &dentry, tx);	/* written at offset smp_objsize */
+
+	sm->sm_phys->smp_objsize += sizeof (dentry);	/* count the new word */
+}
+
+/*
+ * Writes one or more entries given a segment.
+ *
+ * The segment's offset (relative to sm_start) and its length are converted
+ * to units of 1 << sm_shift and emitted as consecutive entries whose run
+ * is at most SM_RUN_MAX (one-word) or SM2_RUN_MAX (two-word) units. Words
+ * are stored directly into the dbuf's data at the position derived from
+ * smp_objsize - db_offset, and smp_objsize is advanced for every word
+ * written, including padding words.
+ *
+ *	sm	space map being appended to
+ *	rs	segment to encode
+ *	maptype	type encoded into each entry
+ *	vdev_id	vdev encoded into two-word entries; must be SM_NO_VDEVID
+ *		when words is 1
+ *	words	number of words per entry, 1 or 2
+ *	dbp	in/out: the held dbuf that the next word is written into
+ *	tag	tag used when releasing and holding dbufs
+ *	tx	transaction used to dirty any newly held dbuf
+ *
+ * Note: The function may release the dbuf from the pointer initially
+ * passed to it, and return a different dbuf. Also, the space map's
+ * dbuf must be dirty for the changes in sm_phys to take effect.
+ */
+static void
+space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
+    uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx)
+{
+	ASSERT3U(words, !=, 0);
+	ASSERT3U(words, <=, 2);
+
+	/* ensure the vdev_id can be represented by the space map */
+	ASSERT3U(vdev_id, <=, SM_NO_VDEVID);
+
 	/*
-	 * All space_maps always have a debug entry so account for it here.
+	 * if this is a single word entry, ensure that no vdev was
+	 * specified.
 	 */
-	entries = 1;
+	IMPLY(words == 1, vdev_id == SM_NO_VDEVID);
 
+	dmu_buf_t *db = *dbp;
+	ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+	uint64_t *block_base = db->db_data;
+	uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
+	uint64_t *block_cursor = block_base +
+	    (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);
+
+	ASSERT3P(block_cursor, <=, block_end);
+
+	uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+	uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
+	uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX;
+
+	ASSERT3U(rs->rs_start, >=, sm->sm_start);
+	ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size);
+	ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size);
+	ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size);
+
+	while (size != 0) {
+		ASSERT3P(block_cursor, <=, block_end);
+
+		/*
+		 * If we are at the end of this block, flush it and start
+		 * writing again from the beginning.
+		 *
+		 * The current dbuf is released and the dbuf at offset
+		 * smp_objsize is held and dirtied in its place; the
+		 * caller's pointer is updated to the new dbuf.
+		 */
+		if (block_cursor == block_end) {
+			dmu_buf_rele(db, tag);
+
+			uint64_t next_word_offset = sm->sm_phys->smp_objsize;
+			VERIFY0(dmu_buf_hold(sm->sm_os,
+			    space_map_object(sm), next_word_offset,
+			    tag, &db, DMU_READ_PREFETCH));
+			dmu_buf_will_dirty(db, tx);
+
+			/* update caller's dbuf */
+			*dbp = db;
+
+			ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+			block_base = db->db_data;
+			block_cursor = block_base;
+			block_end = block_base +
+			    (db->db_size / sizeof (uint64_t));
+		}
+
+		/*
+		 * If we are writing a two-word entry and we only have one
+		 * word left on this block, just pad it with an empty debug
+		 * entry and write the two-word entry in the next block.
+		 *
+		 * The padding word is counted in smp_objsize, and the
+		 * continue re-enters the loop with the cursor at the block
+		 * end so that the branch above switches to the next dbuf.
+		 */
+		uint64_t *next_entry = block_cursor + 1;
+		if (next_entry == block_end && words > 1) {
+			ASSERT3U(words, ==, 2);
+			*block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
+			    SM_DEBUG_ACTION_ENCODE(0) |
+			    SM_DEBUG_SYNCPASS_ENCODE(0) |
+			    SM_DEBUG_TXG_ENCODE(0);
+			block_cursor++;
+			sm->sm_phys->smp_objsize += sizeof (uint64_t);
+			ASSERT3P(block_cursor, ==, block_end);
+			continue;
+		}
+
+		uint64_t run_len = MIN(size, run_max);
+		switch (words) {
+		case 1:
+			*block_cursor = SM_OFFSET_ENCODE(start) |
+			    SM_TYPE_ENCODE(maptype) |
+			    SM_RUN_ENCODE(run_len);
+			block_cursor++;
+			break;
+		case 2:
+			/* write the first word of the entry */
+			*block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
+			    SM2_RUN_ENCODE(run_len) |
+			    SM2_VDEV_ENCODE(vdev_id);
+			block_cursor++;
+
+			/* move on to the second word of the entry */
+			ASSERT3P(block_cursor, <, block_end);
+			*block_cursor = SM2_TYPE_ENCODE(maptype) |
+			    SM2_OFFSET_ENCODE(start);
+			block_cursor++;
+			break;
+		default:
+			panic("%d-word space map entries are not supported",
+			    words);
+			break;
+		}
+		sm->sm_phys->smp_objsize += words * sizeof (uint64_t);
+
+		start += run_len;
+		size -= run_len;
+	}
+	ASSERT0(size);
+
+}
+
+/*
+ * Appends the contents of the range tree to the space map object: first
+ * an intro debug entry, then the entries for every segment found by
+ * walking rt->rt_root from avl_first(). Each segment is written with
+ * one-word entries unless the conditions listed in the loop below select
+ * the two-word format.
+ *
+ * Note: The space map's dbuf must be dirty for the changes in sm_phys to
+ * take effect.
+ *
+ * NOTE(review): the DEBUG-only size estimate is computed with
+ * SM_NO_VDEVID rather than with the vdev_id argument; confirm that it is
+ * still an upper bound when vdev_id is set and two-word entries are
+ * written.
+ */
+static void
+space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+    uint64_t vdev_id, dmu_tx_t *tx)
+{
+	spa_t *spa = tx->tx_pool->dp_spa;
+	dmu_buf_t *db;
+
+	space_map_write_intro_debug(sm, maptype, tx);
+
+#ifdef DEBUG
 	/*
-	 * Traverse the range tree and calculate the number of space map
-	 * entries that would be required to write out the range tree.
+	 * We do this right after we write the intro debug entry
+	 * because the estimate does not take it into account.
 	 */
-	for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
-		size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
-		entries += howmany(size, SM_RUN_MAX);
+	uint64_t initial_objsize = sm->sm_phys->smp_objsize;
+	uint64_t estimated_growth =
+	    space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
+	uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
+#endif
+
+	/*
+	 * Find the offset right after the last word in the space map
+	 * and use that to get a hold of the last block, so we can
+	 * start appending to it.
+	 */
+	uint64_t next_word_offset = sm->sm_phys->smp_objsize;
+	VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
+	    next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
+	ASSERT3U(db->db_size, ==, sm->sm_blksz);
+
+	dmu_buf_will_dirty(db, tx);
+
+	avl_tree_t *t = &rt->rt_root;
+	for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
+		uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
+		uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+		uint8_t words = 1;
+
+		/*
+		 * We only write two-word entries when both of the following
+		 * are true:
+		 *
+		 * [1] The feature is enabled.
+		 * [2] The offset or run is too big for a single-word entry,
+		 * 	or the vdev_id is set (meaning not equal to
+		 * 	SM_NO_VDEVID).
+		 *
+		 * Note that for purposes of testing we've added the case that
+		 * we write two-word entries occasionally when the feature is
+		 * enabled and zfs_force_some_double_word_sm_entries has been
+		 * set.
+		 */
+		if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
+		    (offset >= (1ULL << SM_OFFSET_BITS) ||
+		    length > SM_RUN_MAX ||
+		    vdev_id != SM_NO_VDEVID ||
+		    (zfs_force_some_double_word_sm_entries &&
+		    spa_get_random(100) == 0)))
+			words = 2;
+
+		space_map_write_seg(sm, rs, maptype, vdev_id, words,
+		    &db, FTAG, tx);
 	}
-	return (entries);
+
+	dmu_buf_rele(db, FTAG);	/* may not be the dbuf held above */
+
+#ifdef DEBUG
+	/*
+	 * We expect our estimation to be based on the worst case
+	 * scenario [see comment in space_map_estimate_optimal_size()].
+	 * Therefore we expect the actual objsize to be equal or less
+	 * than whatever we estimated it to be.
+	 */
+	ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize);
+#endif
 }
 
+/*
+ * Note: This function manipulates the state of the given space map but
+ * does not hold any locks implicitly. Thus the caller is responsible
+ * for synchronizing writes to the space map.
+ */
 void
 space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
-    dmu_tx_t *tx)
+    uint64_t vdev_id, dmu_tx_t *tx)
 {
 	objset_t *os = sm->sm_os;
-	spa_t *spa = dmu_objset_spa(os);
-	avl_tree_t *t = &rt->rt_root;
-	range_seg_t *rs;
-	uint64_t size, total, rt_space, nodes;
-	uint64_t *entry, *entry_map, *entry_map_end;
-	uint64_t expected_entries, actual_entries = 1;
 
 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
 	VERIFY3U(space_map_object(sm), !=, 0);
+
 	dmu_buf_will_dirty(sm->sm_dbuf, tx);
 
 	/*
@@ -421,68 +740,17 @@ space_map_write(space_map_t *sm, range_tree_t *rt, map
 	else
 		sm->sm_phys->smp_alloc -= range_tree_space(rt);
 
-	expected_entries = space_map_entries(sm, rt);
+	uint64_t nodes = avl_numnodes(&rt->rt_root);
+	uint64_t rt_space = range_tree_space(rt);
 
-	entry_map = zio_buf_alloc(sm->sm_blksz);
-	entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t));
-	entry = entry_map;
+	space_map_write_impl(sm, rt, maptype, vdev_id, tx);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201807302256.w6UMuPvv043700>