Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 30 Jul 2018 22:56:26 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r336946 - vendor-sys/illumos/dist/common/zfs vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor/illumos/dist/cmd/zdb vendor/illumos/dist/cmd/...
Message-ID:  <201807302256.w6UMuQMD043714@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Mon Jul 30 22:56:24 2018
New Revision: 336946
URL: https://svnweb.freebsd.org/changeset/base/336946

Log:
  9238 ZFS Spacemap Encoding V2
  
  The current space map encoding has the following disadvantages:
  [1] Assuming a 512-byte sector size, each entry can represent at most 16MB for a segment.
  This makes the encoding very inefficient for large regions of space.
  [2] As vdev-wide space maps have started to be used by new features (i.e.
  device removal, zpool checkpoint) we've started imposing limits in the
  vdevs that can be used with them based on the maximum addressable offset
  (currently 64PB for a top-level vdev).
  
  The new encoding remains backwards compatible with the old one. The introduced
  two-word entry format, besides extending the limits imposed by the single-entry
  layout, also includes a vdev field and some extra padding after its prefix.
  
  The extra padding after the prefix is reserved for future usage (e.g.
  new prefixes for future encodings or new fields for flags). The new vdev field
  not only makes the space maps more self-descriptive, but also opens the doors
  for pool-wide space maps.
  
  One final important note is that the number of bits used for vdevs is reduced
  to 24 bits for blkptrs. That was decided as we don't know of any setups that
  use more than 16M vdevs for the time being and
  we wanted to fit the vdev field in the space map. In addition that gives us
  some extra bits in dva_t.
  
  illumos/illumos-gate@17f11284b49b98353b5119463254074fd9bc0a28
  
  Reviewed by: Matt Ahrens <mahrens@delphix.com>
  Reviewed by: George Wilson <gwilson@zfsmail.com>
  Approved by: Gordon Ross <gwr@nexenta.com>
  Author: Serapheim Dimitropoulos <serapheim@delphix.com>

Modified:
  vendor/illumos/dist/cmd/zdb/zdb.c
  vendor/illumos/dist/cmd/ztest/ztest.c
  vendor/illumos/dist/man/man5/zpool-features.5

Changes in other areas also in this revision:
Modified:
  vendor-sys/illumos/dist/common/zfs/zfeature_common.c
  vendor-sys/illumos/dist/common/zfs/zfeature_common.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa_checkpoint.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_map.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect_mapping.c

Modified: vendor/illumos/dist/cmd/zdb/zdb.c
==============================================================================
--- vendor/illumos/dist/cmd/zdb/zdb.c	Mon Jul 30 22:39:30 2018	(r336945)
+++ vendor/illumos/dist/cmd/zdb/zdb.c	Mon Jul 30 22:56:24 2018	(r336946)
@@ -774,7 +774,6 @@ verify_spacemap_refcounts(spa_t *spa)
 static void
 dump_spacemap(objset_t *os, space_map_t *sm)
 {
-	uint64_t alloc, offset, entry;
 	char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID",
 	    "INVALID", "INVALID", "INVALID", "INVALID" };
 
@@ -791,41 +790,73 @@ dump_spacemap(objset_t *os, space_map_t *sm)
 	/*
 	 * Print out the freelist entries in both encoded and decoded form.
 	 */
-	alloc = 0;
-	for (offset = 0; offset < space_map_length(sm);
-	    offset += sizeof (entry)) {
-		uint8_t mapshift = sm->sm_shift;
+	uint8_t mapshift = sm->sm_shift;
+	int64_t alloc = 0;
+	uint64_t word;
+	for (uint64_t offset = 0; offset < space_map_length(sm);
+	    offset += sizeof (word)) {
 
 		VERIFY0(dmu_read(os, space_map_object(sm), offset,
-		    sizeof (entry), &entry, DMU_READ_PREFETCH));
-		if (SM_DEBUG_DECODE(entry)) {
+		    sizeof (word), &word, DMU_READ_PREFETCH));
 
+		if (sm_entry_is_debug(word)) {
 			(void) printf("\t    [%6llu] %s: txg %llu, pass %llu\n",
-			    (u_longlong_t)(offset / sizeof (entry)),
-			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
-			    (u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
-			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
+			    (u_longlong_t)(offset / sizeof (word)),
+			    ddata[SM_DEBUG_ACTION_DECODE(word)],
+			    (u_longlong_t)SM_DEBUG_TXG_DECODE(word),
+			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word));
+			continue;
+		}
+
+		uint8_t words;
+		char entry_type;
+		uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID;
+
+		if (sm_entry_is_single_word(word)) {
+			entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ?
+			    'A' : 'F';
+			entry_off = (SM_OFFSET_DECODE(word) << mapshift) +
+			    sm->sm_start;
+			entry_run = SM_RUN_DECODE(word) << mapshift;
+			words = 1;
 		} else {
-			(void) printf("\t    [%6llu]    %c  range:"
-			    " %010llx-%010llx  size: %06llx\n",
-			    (u_longlong_t)(offset / sizeof (entry)),
-			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
-			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
-			    mapshift) + sm->sm_start),
-			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
-			    mapshift) + sm->sm_start +
-			    (SM_RUN_DECODE(entry) << mapshift)),
-			    (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift));
-			if (SM_TYPE_DECODE(entry) == SM_ALLOC)
-				alloc += SM_RUN_DECODE(entry) << mapshift;
-			else
-				alloc -= SM_RUN_DECODE(entry) << mapshift;
+			/* it is a two-word entry so we read another word */
+			ASSERT(sm_entry_is_double_word(word));
+
+			uint64_t extra_word;
+			offset += sizeof (extra_word);
+			VERIFY0(dmu_read(os, space_map_object(sm), offset,
+			    sizeof (extra_word), &extra_word,
+			    DMU_READ_PREFETCH));
+
+			ASSERT3U(offset, <=, space_map_length(sm));
+
+			entry_run = SM2_RUN_DECODE(word) << mapshift;
+			entry_vdev = SM2_VDEV_DECODE(word);
+			entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ?
+			    'A' : 'F';
+			entry_off = (SM2_OFFSET_DECODE(extra_word) <<
+			    mapshift) + sm->sm_start;
+			words = 2;
 		}
+
+		(void) printf("\t    [%6llu]    %c  range:"
+		    " %010llx-%010llx  size: %06llx vdev: %06llu words: %u\n",
+		    (u_longlong_t)(offset / sizeof (word)),
+		    entry_type, (u_longlong_t)entry_off,
+		    (u_longlong_t)(entry_off + entry_run),
+		    (u_longlong_t)entry_run,
+		    (u_longlong_t)entry_vdev, words);
+
+		if (entry_type == 'A')
+			alloc += entry_run;
+		else
+			alloc -= entry_run;
 	}
-	if (alloc != space_map_allocated(sm)) {
-		(void) printf("space_map_object alloc (%llu) INCONSISTENT "
-		    "with space map summary (%llu)\n",
-		    (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc);
+	if ((uint64_t)alloc != space_map_allocated(sm)) {
+		(void) printf("space_map_object alloc (%lld) INCONSISTENT "
+		    "with space map summary (%lld)\n",
+		    (longlong_t)space_map_allocated(sm), (longlong_t)alloc);
 	}
 }
 
@@ -1153,7 +1184,7 @@ dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_clas
 	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
 		dump_dde(ddt, &dde, walk);
 
-	ASSERT(error == ENOENT);
+	ASSERT3U(error, ==, ENOENT);
 
 	(void) printf("\n");
 }
@@ -3068,15 +3099,14 @@ typedef struct checkpoint_sm_exclude_entry_arg {
 } checkpoint_sm_exclude_entry_arg_t;
 
 static int
-checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
-    void *arg)
+checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg)
 {
 	checkpoint_sm_exclude_entry_arg_t *cseea = arg;
 	vdev_t *vd = cseea->cseea_vd;
-	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-	uint64_t end = offset + size;
+	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+	uint64_t end = sme->sme_offset + sme->sme_run;
 
-	ASSERT(type == SM_FREE);
+	ASSERT(sme->sme_type == SM_FREE);
 
 	/*
 	 * Since the vdev_checkpoint_sm exists in the vdev level
@@ -3094,7 +3124,7 @@ checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_
 	 * metaslab boundaries. So if needed we could add code
 	 * that handles metaslab-crossing segments in the future.
 	 */
-	VERIFY3U(offset, >=, ms->ms_start);
+	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
 	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 
 	/*
@@ -3102,10 +3132,10 @@ checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_
 	 * also verify that the entry is there to begin with.
 	 */
 	mutex_enter(&ms->ms_lock);
-	range_tree_remove(ms->ms_allocatable, offset, size);
+	range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
 	mutex_exit(&ms->ms_lock);
 
-	cseea->cseea_checkpoint_size += size;
+	cseea->cseea_checkpoint_size += sme->sme_run;
 	return (0);
 }
 
@@ -4080,15 +4110,14 @@ typedef struct verify_checkpoint_sm_entry_cb_arg {
 #define	ENTRIES_PER_PROGRESS_UPDATE 10000
 
 static int
-verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size,
-    void *arg)
+verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
 {
 	verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
 	vdev_t *vd = vcsec->vcsec_vd;
-	metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift];
-	uint64_t end = offset + size;
+	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+	uint64_t end = sme->sme_offset + sme->sme_run;
 
-	ASSERT(type == SM_FREE);
+	ASSERT(sme->sme_type == SM_FREE);
 
 	if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
 		(void) fprintf(stderr,
@@ -4102,7 +4131,7 @@ verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t
 	/*
 	 * See comment in checkpoint_sm_exclude_entry_cb()
 	 */
-	VERIFY3U(offset, >=, ms->ms_start);
+	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
 	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
 
 	/*
@@ -4111,7 +4140,7 @@ verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t
 	 * their respective ms_allocateable trees should not contain them.
 	 */
 	mutex_enter(&ms->ms_lock);
-	range_tree_verify(ms->ms_allocatable, offset, size);
+	range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run);
 	mutex_exit(&ms->ms_lock);
 
 	return (0);
@@ -4357,7 +4386,7 @@ verify_checkpoint(spa_t *spa)
 	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
 	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
 
-	if (error == ENOENT) {
+	if (error == ENOENT && !dump_opt['L']) {
 		/*
 		 * If the feature is active but the uberblock is missing
 		 * then we must be in the middle of discarding the
@@ -4380,7 +4409,7 @@ verify_checkpoint(spa_t *spa)
 		error = 3;
 	}
 
-	if (error == 0)
+	if (error == 0 && !dump_opt['L'])
 		verify_checkpoint_blocks(spa);
 
 	return (error);
@@ -4485,7 +4514,7 @@ dump_zpool(spa_t *spa)
 	if (dump_opt['h'])
 		dump_history(spa);
 
-	if (rc == 0 && !dump_opt['L'])
+	if (rc == 0)
 		rc = verify_checkpoint(spa);
 
 	if (rc != 0) {

Modified: vendor/illumos/dist/cmd/ztest/ztest.c
==============================================================================
--- vendor/illumos/dist/cmd/ztest/ztest.c	Mon Jul 30 22:39:30 2018	(r336945)
+++ vendor/illumos/dist/cmd/ztest/ztest.c	Mon Jul 30 22:56:24 2018	(r336946)
@@ -193,6 +193,7 @@ extern uint64_t zfs_deadman_synctime_ms;
 extern int metaslab_preload_limit;
 extern boolean_t zfs_compressed_arc_enabled;
 extern boolean_t zfs_abd_scatter_enabled;
+extern boolean_t zfs_force_some_double_word_sm_entries;
 
 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
@@ -6390,6 +6391,12 @@ main(int argc, char **argv)
 
 	dprintf_setup(&argc, argv);
 	zfs_deadman_synctime_ms = 300000;
+	/*
+	 * As two-word space map entries may not come up often (especially
+	 * if pool and vdev sizes are small) we want to force at least some
+	 * of them so the feature get tested.
+	 */
+	zfs_force_some_double_word_sm_entries = B_TRUE;
 
 	ztest_fd_rand = open("/dev/urandom", O_RDONLY);
 	ASSERT3S(ztest_fd_rand, >=, 0);

Modified: vendor/illumos/dist/man/man5/zpool-features.5
==============================================================================
--- vendor/illumos/dist/man/man5/zpool-features.5	Mon Jul 30 22:39:30 2018	(r336945)
+++ vendor/illumos/dist/man/man5/zpool-features.5	Mon Jul 30 22:56:24 2018	(r336946)
@@ -423,7 +423,6 @@ This feature becomes \fBactive\fR as soon as it is ena
 never return to being \fBenabled\fR.
 
 .RE
-
 .sp
 .ne 2
 .na
@@ -488,6 +487,34 @@ This feature becomes \fBactive\fR when the "zpool chec
 is used to checkpoint the pool.
 The feature will only return back to being \fBenabled\fR when the pool
 is rewound or the checkpoint has been discarded.
+
+.RE
+.sp
+.ne 2
+.na
+\fB\fBspacemap_v2\fR\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	com.delphix:spacemap_v2
+READ\-ONLY COMPATIBLE	yes
+DEPENDENCIES	none
+.TE
+
+This feature enables the use of the new space map encoding which
+consists of two words (instead of one) whenever it is advantageous.
+The new encoding allows space maps to represent large regions of
+space more efficiently on-disk while also increasing their maximum
+addressable offset.
+
+This feature becomes \fBactive\fR once it is \fBenabled\fR, and never
+returns back to being \fBenabled\fR.
+
+.RE
+.sp
+.ne 2
+.na
 \fB\fBlarge_blocks\fR\fR
 .ad
 .RS 4n



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201807302256.w6UMuQMD043714>