Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 30 Oct 2019 16:49:44 +0000 (UTC)
From:      Andriy Gapon <avg@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-12@freebsd.org
Subject:   svn commit: r354187 - in stable/12: cddl/contrib/opensolaris/cmd/zdb sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys
Message-ID:  <201910301649.x9UGni55041212@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: avg
Date: Wed Oct 30 16:49:44 2019
New Revision: 354187
URL: https://svnweb.freebsd.org/changeset/base/354187

Log:
  MFC r353612: MFV r353611: 10330 merge recent ZoL vdev and metaslab changes

Modified:
  stable/12/cddl/contrib/opensolaris/cmd/zdb/zdb.c
  stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
  stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
  stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
  stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
  stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
Directory Properties:
  stable/12/   (props changed)

Modified: stable/12/cddl/contrib/opensolaris/cmd/zdb/zdb.c
==============================================================================
--- stable/12/cddl/contrib/opensolaris/cmd/zdb/zdb.c	Wed Oct 30 16:47:05 2019	(r354186)
+++ stable/12/cddl/contrib/opensolaris/cmd/zdb/zdb.c	Wed Oct 30 16:49:44 2019	(r354187)
@@ -901,11 +901,8 @@ dump_metaslab(metaslab_t *msp)
 
 	if (dump_opt['m'] > 2 && !dump_opt['L']) {
 		mutex_enter(&msp->ms_lock);
-		metaslab_load_wait(msp);
-		if (!msp->ms_loaded) {
-			VERIFY0(metaslab_load(msp));
-			range_tree_stat_verify(msp->ms_allocatable);
-		}
+		VERIFY0(metaslab_load(msp));
+		range_tree_stat_verify(msp->ms_allocatable);
 		dump_metaslab_stats(msp);
 		metaslab_unload(msp);
 		mutex_exit(&msp->ms_lock);

Modified: stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
==============================================================================
--- stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	Wed Oct 30 16:47:05 2019	(r354186)
+++ stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	Wed Oct 30 16:49:44 2019	(r354187)
@@ -1467,7 +1467,7 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 /*
  * Wait for any in-progress metaslab loads to complete.
  */
-void
+static void
 metaslab_load_wait(metaslab_t *msp)
 {
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -1478,20 +1478,17 @@ metaslab_load_wait(metaslab_t *msp)
 	}
 }
 
-int
-metaslab_load(metaslab_t *msp)
+static int
+metaslab_load_impl(metaslab_t *msp)
 {
 	int error = 0;
-	boolean_t success = B_FALSE;
 
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
-	ASSERT(!msp->ms_loaded);
-	ASSERT(!msp->ms_loading);
+	ASSERT(msp->ms_loading);
 
-	msp->ms_loading = B_TRUE;
 	/*
 	 * Nobody else can manipulate a loading metaslab, so it's now safe
-	 * to drop the lock.  This way we don't have to hold the lock while
+	 * to drop the lock. This way we don't have to hold the lock while
 	 * reading the spacemap from disk.
 	 */
 	mutex_exit(&msp->ms_lock);
@@ -1508,29 +1505,49 @@ metaslab_load(metaslab_t *msp)
 		    msp->ms_start, msp->ms_size);
 	}
 
-	success = (error == 0);
-
 	mutex_enter(&msp->ms_lock);
-	msp->ms_loading = B_FALSE;
 
-	if (success) {
-		ASSERT3P(msp->ms_group, !=, NULL);
-		msp->ms_loaded = B_TRUE;
+	if (error != 0)
+		return (error);
 
-		/*
-		 * If the metaslab already has a spacemap, then we need to
-		 * remove all segments from the defer tree; otherwise, the
-		 * metaslab is completely empty and we can skip this.
-		 */
-		if (msp->ms_sm != NULL) {
-			for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-				range_tree_walk(msp->ms_defer[t],
-				    range_tree_remove, msp->ms_allocatable);
-			}
+	ASSERT3P(msp->ms_group, !=, NULL);
+	msp->ms_loaded = B_TRUE;
+
+	/*
+	 * If the metaslab already has a spacemap, then we need to
+	 * remove all segments from the defer tree; otherwise, the
+	 * metaslab is completely empty and we can skip this.
+	 */
+	if (msp->ms_sm != NULL) {
+		for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+			range_tree_walk(msp->ms_defer[t],
+			    range_tree_remove, msp->ms_allocatable);
 		}
-		msp->ms_max_size = metaslab_block_maxsize(msp);
 	}
+	msp->ms_max_size = metaslab_block_maxsize(msp);
+
+	return (0);
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	/*
+	 * There may be another thread loading the same metaslab, if that's
+	 * the case just wait until the other thread is done and return.
+	 */
+	metaslab_load_wait(msp);
+	if (msp->ms_loaded)
+		return (0);
+	VERIFY(!msp->ms_loading);
+
+	msp->ms_loading = B_TRUE;
+	int error = metaslab_load_impl(msp);
+	msp->ms_loading = B_FALSE;
 	cv_broadcast(&msp->ms_load_cv);
+
 	return (error);
 }
 
@@ -2090,13 +2107,10 @@ metaslab_activate(metaslab_t *msp, int allocator, uint
 	ASSERT(MUTEX_HELD(&msp->ms_lock));
 
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
-		int error = 0;
-		metaslab_load_wait(msp);
-		if (!msp->ms_loaded) {
-			if ((error = metaslab_load(msp)) != 0) {
-				metaslab_group_sort(msp->ms_group, msp, 0);
-				return (error);
-			}
+		int error = metaslab_load(msp);
+		if (error != 0) {
+			metaslab_group_sort(msp->ms_group, msp, 0);
+			return (error);
 		}
 		if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
 			/*
@@ -2208,9 +2222,7 @@ metaslab_preload(void *arg)
 	ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 
 	mutex_enter(&msp->ms_lock);
-	metaslab_load_wait(msp);
-	if (!msp->ms_loaded)
-		(void) metaslab_load(msp);
+	(void) metaslab_load(msp);
 	msp->ms_selected_txg = spa_syncing_txg(spa);
 	mutex_exit(&msp->ms_lock);
 }

Modified: stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
==============================================================================
--- stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h	Wed Oct 30 16:47:05 2019	(r354186)
+++ stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h	Wed Oct 30 16:49:44 2019	(r354187)
@@ -48,7 +48,6 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64
     metaslab_t **);
 void metaslab_fini(metaslab_t *);
 
-void metaslab_load_wait(metaslab_t *);
 int metaslab_load(metaslab_t *);
 void metaslab_unload(metaslab_t *);
 

Modified: stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
==============================================================================
--- stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h	Wed Oct 30 16:47:05 2019	(r354186)
+++ stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h	Wed Oct 30 16:49:44 2019	(r354187)
@@ -370,8 +370,8 @@ struct metaslab {
 	uint64_t	ms_initializing; /* leaves initializing this ms */
 
 	/*
-	 * We must hold both ms_lock and ms_group->mg_lock in order to
-	 * modify ms_loaded.
+	 * We must always hold the ms_lock when modifying ms_loaded
+	 * and ms_loading.
 	 */
 	boolean_t	ms_loaded;
 	boolean_t	ms_loading;

Modified: stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
==============================================================================
--- stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	Wed Oct 30 16:47:05 2019	(r354186)
+++ stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c	Wed Oct 30 16:49:44 2019	(r354187)
@@ -163,34 +163,34 @@ static vdev_ops_t *vdev_ops_table[] = {
 };
 
 
-/* target number of metaslabs per top-level vdev */
-int vdev_max_ms_count = 200;
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count, CTLFLAG_RWTUN,
-    &vdev_max_ms_count, 0,
+/* default target for number of metaslabs per top-level vdev */
+int zfs_vdev_default_ms_count = 200;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_count, CTLFLAG_RWTUN,
+    &zfs_vdev_default_ms_count, 0,
     "Target number of metaslabs per top-level vdev");
 
 /* minimum number of metaslabs per top-level vdev */
-int vdev_min_ms_count = 16;
+int zfs_vdev_min_ms_count = 16;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RWTUN,
-    &vdev_min_ms_count, 0,
+    &zfs_vdev_min_ms_count, 0,
     "Minimum number of metaslabs per top-level vdev");
 
 /* practical upper limit of total metaslabs per top-level vdev */
-int vdev_ms_count_limit = 1ULL << 17;
+int zfs_vdev_ms_count_limit = 1ULL << 17;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN,
-    &vdev_ms_count_limit, 0,
+    &zfs_vdev_ms_count_limit, 0,
     "Maximum number of metaslabs per top-level vdev");
 
 /* lower limit for metaslab size (512M) */
-int vdev_default_ms_shift = 29;
+int zfs_vdev_default_ms_shift = 29;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN,
-    &vdev_default_ms_shift, 0,
+    &zfs_vdev_default_ms_shift, 0,
     "Default shift between vdev size and number of metaslabs");
 
-/* upper limit for metaslab size (256G) */
-int vdev_max_ms_shift = 38;
+/* upper limit for metaslab size (16G) */
+int zfs_vdev_max_ms_shift = 34;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN,
-    &vdev_max_ms_shift, 0,
+    &zfs_vdev_max_ms_shift, 0,
     "Maximum shift between vdev size and number of metaslabs");
 
 boolean_t vdev_validate_skip = B_FALSE;
@@ -2205,16 +2205,24 @@ void
 vdev_metaslab_set_size(vdev_t *vd)
 {
 	uint64_t asize = vd->vdev_asize;
-	uint64_t ms_count = asize >> vdev_default_ms_shift;
+	uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
 	uint64_t ms_shift;
 
 	/*
 	 * There are two dimensions to the metaslab sizing calculation:
 	 * the size of the metaslab and the count of metaslabs per vdev.
-	 * In general, we aim for vdev_max_ms_count (200) metaslabs. The
-	 * range of the dimensions are as follows:
 	 *
-	 *	2^29 <= ms_size  <= 2^38
+	 * The default values used below are a good balance between memory
+	 * usage (larger metaslab size means more memory needed for loaded
+	 * metaslabs; more metaslabs means more memory needed for the
+	 * metaslab_t structs), metaslab load time (larger metaslabs take
+	 * longer to load), and metaslab sync time (more metaslabs means
+	 * more time spent syncing all of them).
+	 *
+	 * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
+	 * The range of the dimensions are as follows:
+	 *
+	 *	2^29 <= ms_size  <= 2^34
 	 *	  16 <= ms_count <= 131,072
 	 *
 	 * On the lower end of vdev sizes, we aim for metaslabs sizes of
@@ -2223,35 +2231,41 @@ vdev_metaslab_set_size(vdev_t *vd)
 	 * of at least 16 metaslabs will override this minimum size goal.
 	 *
 	 * On the upper end of vdev sizes, we aim for a maximum metaslab
-	 * size of 256GB.  However, we will cap the total count to 2^17
-	 * metaslabs to keep our memory footprint in check.
+	 * size of 16GB.  However, we will cap the total count to 2^17
+	 * metaslabs to keep our memory footprint in check and let the
+	 * metaslab size grow from there if that limit is hit.
 	 *
 	 * The net effect of applying above constrains is summarized below.
 	 *
-	 *	vdev size	metaslab count
-	 *	-------------|-----------------
-	 *	< 8GB		~16
-	 *	8GB - 100GB	one per 512MB
-	 *	100GB - 50TB	~200
-	 *	50TB - 32PB	one per 256GB
-	 *	> 32PB		~131,072
-	 *	-------------------------------
+	 *   vdev size       metaslab count
+	 *  --------------|-----------------
+	 *      < 8GB        ~16
+	 *  8GB   - 100GB   one per 512MB
+	 *  100GB - 3TB     ~200
+	 *  3TB   - 2PB     one per 16GB
+	 *      > 2PB       ~131,072
+	 *  --------------------------------
+	 *
+	 *  Finally, note that all of the above calculate the initial
+	 *  number of metaslabs. Expanding a top-level vdev will result
+	 *  in additional metaslabs being allocated making it possible
+	 *  to exceed the zfs_vdev_ms_count_limit.
 	 */
 
-	if (ms_count < vdev_min_ms_count)
-		ms_shift = highbit64(asize / vdev_min_ms_count);
-	else if (ms_count > vdev_max_ms_count)
-		ms_shift = highbit64(asize / vdev_max_ms_count);
+	if (ms_count < zfs_vdev_min_ms_count)
+		ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
+	else if (ms_count > zfs_vdev_default_ms_count)
+		ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
 	else
-		ms_shift = vdev_default_ms_shift;
+		ms_shift = zfs_vdev_default_ms_shift;
 
 	if (ms_shift < SPA_MAXBLOCKSHIFT) {
 		ms_shift = SPA_MAXBLOCKSHIFT;
-	} else if (ms_shift > vdev_max_ms_shift) {
-		ms_shift = vdev_max_ms_shift;
+	} else if (ms_shift > zfs_vdev_max_ms_shift) {
+		ms_shift = zfs_vdev_max_ms_shift;
 		/* cap the total count to constrain memory footprint */
-		if ((asize >> ms_shift) > vdev_ms_count_limit)
-			ms_shift = highbit64(asize / vdev_ms_count_limit);
+		if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
+			ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
 	}
 
 	vd->vdev_ms_shift = ms_shift;
@@ -3611,13 +3625,17 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
 boolean_t
 vdev_is_spacemap_addressable(vdev_t *vd)
 {
+	if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
+		return (B_TRUE);
+
 	/*
-	 * Assuming 47 bits of the space map entry dedicated for the entry's
-	 * offset (see description in space_map.h), we calculate the maximum
-	 * address that can be described by a space map entry for the given
-	 * device.
+	 * If double-word space map entries are not enabled we assume
+	 * 47 bits of the space map entry are dedicated to the entry's
+	 * offset (see SM_OFFSET_BITS in space_map.h). We then use that
+	 * to calculate the maximum address that can be described by a
+	 * space map entry for the given device.
 	 */
-	uint64_t shift = vd->vdev_ashift + 47;
+	uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
 
 	if (shift >= 63) /* detect potential overflow */
 		return (B_TRUE);

Modified: stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
==============================================================================
--- stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c	Wed Oct 30 16:47:05 2019	(r354186)
+++ stable/12/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c	Wed Oct 30 16:49:44 2019	(r354187)
@@ -353,16 +353,6 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
 }
 
 static void
-vdev_initialize_ms_load(metaslab_t *msp)
-{
-	ASSERT(MUTEX_HELD(&msp->ms_lock));
-
-	metaslab_load_wait(msp);
-	if (!msp->ms_loaded)
-		VERIFY0(metaslab_load(msp));
-}
-
-static void
 vdev_initialize_mg_wait(metaslab_group_t *mg)
 {
 	ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
@@ -484,10 +474,10 @@ vdev_initialize_calculate_progress(vdev_t *vd)
 		 * metaslab. Load it and walk the free tree for more accurate
 		 * progress estimation.
 		 */
-		vdev_initialize_ms_load(msp);
+		VERIFY0(metaslab_load(msp));
 
-		for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs;
-		    rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
+		for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
+		    rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
 			logical_rs.rs_start = rs->rs_start;
 			logical_rs.rs_end = rs->rs_end;
 			vdev_xlate(vd, &logical_rs, &physical_rs);
@@ -615,7 +605,7 @@ vdev_initialize_thread(void *arg)
 
 		vdev_initialize_ms_mark(msp);
 		mutex_enter(&msp->ms_lock);
-		vdev_initialize_ms_load(msp);
+		VERIFY0(metaslab_load(msp));
 
 		range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
 		    vd);



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201910301649.x9UGni55041212>