Date:      Thu, 21 Nov 2019 14:10:53 +0000 (UTC)
From:      Andriy Gapon <avg@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r354961 - vendor-sys/illumos/dist/common/zfs vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor-sys/illumos/dist/uts/common/sys/fs vendor/ill...
Message-ID:  <201911211410.xALEArvv050085@repo.freebsd.org>

Author: avg
Date: Thu Nov 21 14:10:53 2019
New Revision: 354961
URL: https://svnweb.freebsd.org/changeset/base/354961

Log:
  10952 defer new resilvers and misc. resilver-related fixes
  
  illumos/illumos-gate@e4c795beb33bf59dd4ad2e3f88f493111484b890
  https://github.com/illumos/illumos-gate/commit/e4c795beb33bf59dd4ad2e3f88f493111484b890
  
  https://www.illumos.org/issues/10952
    From ZoL
    612c4930dd2 Fix the spelling of deferred ???
    cef48f14da6 Remove races from scrub / resilver tests
    4021ba4cfaa Make vdev_set_deferred_resilver() recursive
    8cb119e3dc0 Fix 2 small bugs with cached dsl_scan_phys_t
    5e0bd0ae056 Fix issue with scanning dedup blocks as scan ends
    b3d7725c943 Remove zfs_gitrev.h (this shouldn't be part of 80a91e74696)
    80a91e74696 Defer new resilvers until the current one ends
  
  Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
  Portions contributed by: Brian Behlendorf <behlendorf1@llnl.gov>
  Portions contributed by: Arkadiusz BubaƂa <arkadiusz.bubala@open-e.com>
  Author: Tom Caputi <tcaputi@datto.com>
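
As a rough illustration of the behaviour this import brings in, the sketch below
models the defer-vs-restart decision that the patch adds to vdev_open(),
vdev_clear() and spa_vdev_attach(): when a resilver is already running and the
resilver_defer feature is enabled, a newly faulted-and-repaired device is only
marked deferred and is picked up once the current scan completes
(dsl_scan_done() in the diff below). The structs and field names here are
simplified, hypothetical stand-ins for illustration only, not the real ZFS
types or API; only the control flow mirrors the patch.

/*
 * Minimal userland model of the resilver-deferral decision.
 * Types and fields are simplified stand-ins, not ZFS structures.
 */
#include <stdbool.h>
#include <stdio.h>

struct vdev {
	bool resilver_needed;		/* leaf has a non-empty DTL */
	bool resilver_deferred;		/* models vdev_resilver_deferred */
};

struct spa {
	bool resilver_running;		/* a resilver scan is already active */
	bool feature_resilver_defer;	/* resilver_defer feature enabled */
	bool resilver_deferred;		/* models spa_resilver_deferred */
};

/* Decide whether a new resilver request starts now or is deferred. */
static void
request_resilver(struct spa *spa, struct vdev *vd)
{
	if (!vd->resilver_needed)
		return;

	if (spa->resilver_running && spa->feature_resilver_defer) {
		/*
		 * Mark the device only; the deferred resilver is kicked
		 * off when the current scan finishes.
		 */
		vd->resilver_deferred = true;
		spa->resilver_deferred = true;
	} else {
		/* Pre-feature behaviour: restart the scan immediately. */
		printf("restarting resilver now\n");
	}
}

int
main(void)
{
	struct spa spa = { .resilver_running = true,
	    .feature_resilver_defer = true };
	struct vdev vd = { .resilver_needed = true };

	request_resilver(&spa, &vd);
	printf("deferred: %s\n", vd.resilver_deferred ? "yes" : "no");
	return (0);
}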

Modified:
  vendor-sys/illumos/dist/common/zfs/zfeature_common.c
  vendor-sys/illumos/dist/common/zfs/zfeature_common.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_label.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_removal.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c
  vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h

Changes in other areas also in this revision:
Modified:
  vendor/illumos/dist/cmd/zpool/zpool_main.c
  vendor/illumos/dist/lib/libzfs/common/libzfs.h
  vendor/illumos/dist/lib/libzfs/common/libzfs_pool.c
  vendor/illumos/dist/lib/libzfs/common/libzfs_util.c
  vendor/illumos/dist/man/man1m/zpool.1m
  vendor/illumos/dist/man/man5/zpool-features.5

Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.c
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.c	Thu Nov 21 14:09:46 2019	(r354960)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.c	Thu Nov 21 14:10:53 2019	(r354961)
@@ -300,10 +300,13 @@ zpool_feature_init(void)
 	    "freed or remapped.",
 	    ZFEATURE_FLAG_READONLY_COMPAT, obsolete_counts_deps);
 
-	{
 	zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES,
 	    "org.zfsonlinux:allocation_classes", "allocation_classes",
 	    "Support for separate allocation classes.",
 	    ZFEATURE_FLAG_READONLY_COMPAT, NULL);
-	}
+
+	zfeature_register(SPA_FEATURE_RESILVER_DEFER,
+	    "com.datto:resilver_defer", "resilver_defer",
+	    "Support for defering new resilvers when one is already running.",
+	    ZFEATURE_FLAG_READONLY_COMPAT, NULL);
 }

Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.h
==============================================================================
--- vendor-sys/illumos/dist/common/zfs/zfeature_common.h	Thu Nov 21 14:09:46 2019	(r354960)
+++ vendor-sys/illumos/dist/common/zfs/zfeature_common.h	Thu Nov 21 14:10:53 2019	(r354961)
@@ -63,6 +63,7 @@ typedef enum spa_feature {
 	SPA_FEATURE_POOL_CHECKPOINT,
 	SPA_FEATURE_SPACEMAP_V2,
 	SPA_FEATURE_ALLOCATION_CLASSES,
+	SPA_FEATURE_RESILVER_DEFER,
 	SPA_FEATURES
 } spa_feature_t;
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c	Thu Nov 21 14:09:46 2019	(r354960)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c	Thu Nov 21 14:10:53 2019	(r354961)
@@ -183,12 +183,15 @@ unsigned int zfs_free_min_time_ms = 1000; /* min milli
 unsigned int zfs_obsolete_min_time_ms = 500;
 /* min millisecs to resilver per txg */
 unsigned int zfs_resilver_min_time_ms = 3000;
+int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */
 boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
 boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
 /* max number of blocks to free in a single TXG */
 uint64_t zfs_async_block_max_blocks = UINT64_MAX;
 
+int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */
+
 /*
  * We wait a few txgs after importing a pool to begin scanning so that
  * the import / mounting code isn't held up by scrub / resilver IO.
@@ -455,7 +458,6 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 	scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
 	    SPA_FEATURE_ASYNC_DESTROY);
 
-	bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
 	avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
 	    offsetof(scan_ds_t, sds_node));
 	avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
@@ -513,6 +515,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 		}
 	}
 
+	bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+
 	/* reload the queue into the in-core state */
 	if (scn->scn_phys.scn_queue_obj != 0) {
 		zap_cursor_t zc;
@@ -751,6 +755,11 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
 	spa->spa_scrub_reopen = B_FALSE;
 	(void) spa_vdev_state_exit(spa, NULL, 0);
 
+	if (func == POOL_SCAN_RESILVER) {
+		dsl_resilver_restart(spa->spa_dsl_pool, 0);
+		return (0);
+	}
+
 	if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
 		/* got scrub start cmd, resume paused scrub */
 		int err = dsl_scrub_set_pause_resume(scn->scn_dp,
@@ -766,6 +775,41 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
 	    dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
+/*
+ * Sets the resilver defer flag to B_FALSE on all leaf devs under vd. Returns
+ * B_TRUE if we have devices that need to be resilvered and are available to
+ * accept resilver I/Os.
+ */
+static boolean_t
+dsl_scan_clear_deferred(vdev_t *vd, dmu_tx_t *tx)
+{
+	boolean_t resilver_needed = B_FALSE;
+	spa_t *spa = vd->vdev_spa;
+
+	for (int c = 0; c < vd->vdev_children; c++) {
+		resilver_needed |=
+		    dsl_scan_clear_deferred(vd->vdev_child[c], tx);
+	}
+
+	if (vd == spa->spa_root_vdev &&
+	    spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
+		spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
+		vdev_config_dirty(vd);
+		spa->spa_resilver_deferred = B_FALSE;
+		return (resilver_needed);
+	}
+
+	if (!vdev_is_concrete(vd) || vd->vdev_aux ||
+	    !vd->vdev_ops->vdev_op_leaf)
+		return (resilver_needed);
+
+	if (vd->vdev_resilver_deferred)
+		vd->vdev_resilver_deferred = B_FALSE;
+
+	return (!vdev_is_dead(vd) && !vd->vdev_offline &&
+	    vdev_resilver_needed(vd, NULL, NULL));
+}
+
 /* ARGSUSED */
 static void
 dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
@@ -865,6 +909,25 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu
 		 * Let the async thread assess this and handle the detach.
 		 */
 		spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+
+		/*
+		 * Clear any deferred_resilver flags in the config.
+		 * If there are drives that need resilvering, kick
+		 * off an asynchronous request to start resilver.
+		 * dsl_scan_clear_deferred() may update the config
+		 * before the resilver can restart. In the event of
+		 * a crash during this period, the spa loading code
+		 * will find the drives that need to be resilvered
+		 * when the machine reboots and start the resilver then.
+		 */
+		boolean_t resilver_needed =
+		    dsl_scan_clear_deferred(spa->spa_root_vdev, tx);
+		if (resilver_needed) {
+			spa_history_log_internal(spa,
+			    "starting deferred resilver", tx,
+			    "errors=%llu", spa_get_errlog_size(spa));
+			spa_async_request(spa, SPA_ASYNC_RESILVER);
+		}
 	}
 
 	scn->scn_phys.scn_end_time = gethrestime_sec();
@@ -935,6 +998,7 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
 		/* can't pause a scrub when there is no in-progress scrub */
 		spa->spa_scan_pass_scrub_pause = gethrestime_sec();
 		scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
+		scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED;
 		dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
 	} else {
@@ -949,6 +1013,7 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
 			    gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
 			spa->spa_scan_pass_scrub_pause = 0;
 			scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
+			scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED;
 			dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 		}
 	}
@@ -2335,6 +2400,20 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum 
 	if (scn->scn_phys.scn_state != DSS_SCANNING)
 		return;
 
+	/*
+	 * This function is special because it is the only thing
+	 * that can add scan_io_t's to the vdev scan queues from
+	 * outside dsl_scan_sync(). For the most part this is ok
+	 * as long as it is called from within syncing context.
+	 * However, dsl_scan_sync() expects that no new sio's will
+	 * be added between when all the work for a scan is done
+	 * and the next txg when the scan is actually marked as
+	 * completed. This check ensures we do not issue new sio's
+	 * during this period.
+	 */
+	if (scn->scn_done_txg != 0)
+		return;
+
 	for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
 		if (ddp->ddp_phys_birth == 0 ||
 		    ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
@@ -2986,6 +3065,26 @@ dsl_scan_active(dsl_scan_t *scn)
 }
 
 static boolean_t
+dsl_scan_check_deferred(vdev_t *vd)
+{
+	boolean_t need_resilver = B_FALSE;
+
+	for (int c = 0; c < vd->vdev_children; c++) {
+		need_resilver |=
+		    dsl_scan_check_deferred(vd->vdev_child[c]);
+	}
+
+	if (!vdev_is_concrete(vd) || vd->vdev_aux ||
+	    !vd->vdev_ops->vdev_op_leaf)
+		return (need_resilver);
+
+	if (!vd->vdev_resilver_deferred)
+		need_resilver = B_TRUE;
+
+	return (need_resilver);
+}
+
+static boolean_t
 dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
     uint64_t phys_birth)
 {
@@ -3032,6 +3131,13 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, s
 	if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
 		return (B_FALSE);
 
+	/*
+	 * Check that this top-level vdev has a device under it which
+	 * is resilvering and is not deferred.
+	 */
+	if (!dsl_scan_check_deferred(vd))
+		return (B_FALSE);
+
 	return (B_TRUE);
 }
 
@@ -3193,12 +3299,19 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 	int err = 0;
 	state_sync_type_t sync_type = SYNC_OPTIONAL;
 
+	if (spa->spa_resilver_deferred &&
+	    !spa_feature_is_active(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))
+		spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
+
 	/*
 	 * Check for scn_restart_txg before checking spa_load_state, so
 	 * that we can restart an old-style scan while the pool is being
-	 * imported (see dsl_scan_init).
+	 * imported (see dsl_scan_init). We also restart scans if there
+	 * is a deferred resilver and the user has manually disabled
+	 * deferred resilvers via the tunable.
 	 */
-	if (dsl_scan_restarting(scn, tx)) {
+	if (dsl_scan_restarting(scn, tx) ||
+	    (spa->spa_resilver_deferred && zfs_resilver_disable_defer)) {
 		pool_scan_func_t func = POOL_SCAN_SCRUB;
 		dsl_scan_done(scn, B_FALSE, tx);
 		if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
@@ -3265,6 +3378,27 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 		return;
 
 	/*
+	 * zfs_scan_suspend_progress can be set to disable scan progress.
+	 * We don't want to spin the txg_sync thread, so we add a delay
+	 * here to simulate the time spent doing a scan. This is mostly
+	 * useful for testing and debugging.
+	 */
+	if (zfs_scan_suspend_progress) {
+		uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
+		int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+		    zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+
+		while (zfs_scan_suspend_progress &&
+		    !txg_sync_waiting(scn->scn_dp) &&
+		    !spa_shutting_down(scn->scn_dp->dp_spa) &&
+		    NSEC2MSEC(scan_time_ns) < mintime) {
+			delay(hz);
+			scan_time_ns = gethrtime() - scn->scn_sync_start_time;
+		}
+		return;
+	}
+
+	/*
 	 * It is possible to switch from unsorted to sorted at any time,
 	 * but afterwards the scan will remain sorted unless reloaded from
 	 * a checkpoint after a reboot.
@@ -3393,6 +3527,8 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 			    (longlong_t)tx->tx_txg);
 		}
 	} else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) {
+		ASSERT(scn->scn_clearing);
+
 		/* need to issue scrubbing IOs from per-vdev queues */
 		scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
 		    NULL, ZIO_FLAG_CANFAIL);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c	Thu Nov 21 14:09:46 2019	(r354960)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c	Thu Nov 21 14:10:53 2019	(r354961)
@@ -6176,9 +6176,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *n
 	/*
 	 * Schedule the resilver to restart in the future. We do this to
 	 * ensure that dmu_sync-ed blocks have been stitched into the
-	 * respective datasets.
+	 * respective datasets. We do not do this if resilvers have been
+	 * deferred.
 	 */
-	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+	if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
+	    spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
+		vdev_set_deferred_resilver(spa, newvd);
+	else
+		dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
 
 	if (spa->spa_bootfs)
 		spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
@@ -7069,6 +7074,10 @@ spa_scan(spa_t *spa, pool_scan_func_t func)
 	if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
 		return (SET_ERROR(ENOTSUP));
 
+	if (func == POOL_SCAN_RESILVER &&
+	    !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
+		return (SET_ERROR(ENOTSUP));
+
 	/*
 	 * If a resilver was requested, but there is no DTL on a
 	 * writeable leaf device, we have nothing to do.
@@ -7160,6 +7169,7 @@ static void
 spa_async_thread(void *arg)
 {
 	spa_t *spa = (spa_t *)arg;
+	dsl_pool_t *dp = spa->spa_dsl_pool;
 	int tasks;
 
 	ASSERT(spa->spa_sync_on);
@@ -7235,8 +7245,10 @@ spa_async_thread(void *arg)
 	/*
 	 * Kick off a resilver.
 	 */
-	if (tasks & SPA_ASYNC_RESILVER)
-		dsl_resilver_restart(spa->spa_dsl_pool, 0);
+	if (tasks & SPA_ASYNC_RESILVER &&
+	    (!dsl_scan_resilvering(dp) ||
+	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
+		dsl_resilver_restart(dp, 0);
 
 	if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
 		mutex_enter(&spa_namespace_lock);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa_impl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa_impl.h	Thu Nov 21 14:09:46 2019	(r354960)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa_impl.h	Thu Nov 21 14:10:53 2019	(r354961)
@@ -279,6 +279,13 @@ struct spa {
 	uint64_t	spa_scan_pass_scrub_spent_paused; /* total paused */
 	uint64_t	spa_scan_pass_exam;	/* examined bytes per pass */
 	uint64_t	spa_scan_pass_issued;	/* issued bytes per pass */
+
+	/*
+	 * We are in the middle of a resilver, and another resilver
+	 * is needed once this one completes. This is set iff any
+	 * vdev_resilver_deferred is set.
+	 */
+	boolean_t	spa_resilver_deferred;
 	kmutex_t	spa_async_lock;		/* protect async state */
 	kthread_t	*spa_async_thread;	/* thread doing async task */
 	int		spa_async_suspended;	/* async tasks suspended */

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h	Thu Nov 21 14:09:46 2019	(r354960)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h	Thu Nov 21 14:10:53 2019	(r354961)
@@ -149,6 +149,8 @@ extern int vdev_config_sync(vdev_t **svd, int svdcount
 extern void vdev_state_dirty(vdev_t *vd);
 extern void vdev_state_clean(vdev_t *vd);
 
+extern void vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd);
+
 typedef enum vdev_config_flag {
 	VDEV_CONFIG_SPARE = 1 << 0,
 	VDEV_CONFIG_L2CACHE = 1 << 1,

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h	Thu Nov 21 14:09:46 2019	(r354960)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h	Thu Nov 21 14:10:53 2019	(r354961)
@@ -346,6 +346,7 @@ struct vdev {
 	boolean_t	vdev_cant_write; /* vdev is failing all writes	*/
 	boolean_t	vdev_isspare;	/* was a hot spare		*/
 	boolean_t	vdev_isl2cache;	/* was a l2cache device		*/
+	boolean_t	vdev_resilver_deferred;  /* resilver deferred */
 	vdev_queue_t	vdev_queue;	/* I/O deadline schedule queue	*/
 	vdev_cache_t	vdev_cache;	/* physical block cache		*/
 	spa_aux_vdev_t	*vdev_aux;	/* for l2cache and spares vdevs	*/

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c	Thu Nov 21 14:09:46 2019	(r354960)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c	Thu Nov 21 14:10:53 2019	(r354961)
@@ -760,6 +760,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vde
 		(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
 		    &vd->vdev_resilver_txg);
 
+		if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
+			vdev_set_deferred_resilver(spa, vd);
+
 		/*
 		 * When importing a pool, we want to ignore the persistent fault
 		 * state, as the diagnosis made on another system may not be
@@ -1733,8 +1736,13 @@ vdev_open(vdev_t *vd)
 	 * since this would just restart the scrub we are already doing.
 	 */
 	if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen &&
-	    vdev_resilver_needed(vd, NULL, NULL))
-		spa_async_request(spa, SPA_ASYNC_RESILVER);
+	    vdev_resilver_needed(vd, NULL, NULL)) {
+		if (dsl_scan_resilvering(spa->spa_dsl_pool) &&
+		    spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
+			vdev_set_deferred_resilver(spa, vd);
+		else
+			spa_async_request(spa, SPA_ASYNC_RESILVER);
+	}
 
 	return (0);
 }
@@ -2441,6 +2449,9 @@ vdev_dtl_should_excise(vdev_t *vd)
 	if (vd->vdev_state < VDEV_STATE_DEGRADED)
 		return (B_FALSE);
 
+	if (vd->vdev_resilver_deferred)
+		return (B_FALSE);
+
 	if (vd->vdev_resilver_txg == 0 ||
 	    range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
 		return (B_TRUE);
@@ -3474,8 +3485,14 @@ vdev_clear(spa_t *spa, vdev_t *vd)
 		if (vd != rvd && vdev_writeable(vd->vdev_top))
 			vdev_state_dirty(vd->vdev_top);
 
-		if (vd->vdev_aux == NULL && !vdev_is_dead(vd))
-			spa_async_request(spa, SPA_ASYNC_RESILVER);
+		if (vd->vdev_aux == NULL && !vdev_is_dead(vd)) {
+			if (dsl_scan_resilvering(spa->spa_dsl_pool) &&
+			    spa_feature_is_enabled(spa,
+			    SPA_FEATURE_RESILVER_DEFER))
+				vdev_set_deferred_resilver(spa, vd);
+			else
+				spa_async_request(spa, SPA_ASYNC_RESILVER);
+		}
 
 		spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR);
 	}
@@ -3618,6 +3635,8 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
 		vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
 		    vd->vdev_mg->mg_fragmentation : 0;
 	}
+	if (vd->vdev_ops->vdev_op_leaf)
+		vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
 
 	/*
 	 * If we're getting stats on the root vdev, aggregate the I/O counts
@@ -4330,4 +4349,19 @@ vdev_deadman(vdev_t *vd)
 		}
 		mutex_exit(&vq->vq_lock);
 	}
+}
+
+void
+vdev_set_deferred_resilver(spa_t *spa, vdev_t *vd)
+{
+	for (uint64_t i = 0; i < vd->vdev_children; i++)
+		vdev_set_deferred_resilver(spa, vd->vdev_child[i]);
+
+	if (!vd->vdev_ops->vdev_op_leaf || !vdev_writeable(vd) ||
+	    range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
+		return;
+	}
+
+	vd->vdev_resilver_deferred = B_TRUE;
+	spa->spa_resilver_deferred = B_TRUE;
 }

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c	Thu Nov 21 14:09:46 2019	(r354960)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c	Thu Nov 21 14:10:53 2019	(r354961)
@@ -1239,6 +1239,8 @@ vdev_indirect_read_all(zio_t *zio)
 {
 	indirect_vsd_t *iv = zio->io_vsd;
 
+	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
+
 	for (indirect_split_t *is = list_head(&iv->iv_splits);
 	    is != NULL; is = list_next(&iv->iv_splits, is)) {
 		for (int i = 0; i < is->is_children; i++) {
@@ -1321,7 +1323,8 @@ vdev_indirect_io_start(zio_t *zio)
 		    vdev_indirect_child_io_done, zio));
 	} else {
 		iv->iv_split_block = B_TRUE;
-		if (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
+		if (zio->io_type == ZIO_TYPE_READ &&
+		    zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) {
 			/*
 			 * Read all copies.  Note that for simplicity,
 			 * we don't bother consulting the DTL in the
@@ -1330,13 +1333,17 @@ vdev_indirect_io_start(zio_t *zio)
 			vdev_indirect_read_all(zio);
 		} else {
 			/*
-			 * Read one copy of each split segment, from the
-			 * top-level vdev.  Since we don't know the
-			 * checksum of each split individually, the child
-			 * zio can't ensure that we get the right data.
-			 * E.g. if it's a mirror, it will just read from a
-			 * random (healthy) leaf vdev.  We have to verify
-			 * the checksum in vdev_indirect_io_done().
+			 * If this is a read zio, we read one copy of each
+			 * split segment, from the top-level vdev.  Since
+			 * we don't know the checksum of each split
+			 * individually, the child zio can't ensure that
+			 * we get the right data. E.g. if it's a mirror,
+			 * it will just read from a random (healthy) leaf
+			 * vdev. We have to verify the checksum in
+			 * vdev_indirect_io_done().
+			 *
+			 * For write zios, the vdev code will ensure we write
+			 * to all children.
 			 */
 			for (indirect_split_t *is = list_head(&iv->iv_splits);
 			    is != NULL; is = list_next(&iv->iv_splits, is)) {

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_label.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_label.c	Thu Nov 21 14:09:46 2019	(r354960)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_label.c	Thu Nov 21 14:10:53 2019	(r354961)
@@ -377,6 +377,12 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP,
 			    vd->vdev_top_zap);
 		}
+
+		if (vd->vdev_resilver_deferred) {
+			ASSERT(vd->vdev_ops->vdev_op_leaf);
+			ASSERT(spa->spa_resilver_deferred);
+			fnvlist_add_boolean(nv, ZPOOL_CONFIG_RESILVER_DEFER);
+		}
 	}
 
 	if (getstats) {

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_removal.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_removal.c	Thu Nov 21 14:09:46 2019	(r354960)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_removal.c	Thu Nov 21 14:10:53 2019	(r354961)
@@ -127,7 +127,7 @@ int vdev_removal_max_span = 32 * 1024;
  * This is used by the test suite so that it can ensure that certain
  * actions happen while in the middle of a removal.
  */
-uint64_t zfs_remove_max_bytes_pause = UINT64_MAX;
+int zfs_removal_suspend_progress = 0;
 
 #define	VDEV_REMOVAL_ZAP_OBJS	"lzap"
 
@@ -1433,14 +1433,14 @@ spa_vdev_remove_thread(void *arg)
 
 			/*
 			 * This delay will pause the removal around the point
-			 * specified by zfs_remove_max_bytes_pause. We do this
+			 * specified by zfs_removal_suspend_progress. We do this
 			 * solely from the test suite or during debugging.
 			 */
 			uint64_t bytes_copied =
 			    spa->spa_removing_phys.sr_copied;
 			for (int i = 0; i < TXG_SIZE; i++)
 				bytes_copied += svr->svr_bytes_done[i];
-			while (zfs_remove_max_bytes_pause <= bytes_copied &&
+			while (zfs_removal_suspend_progress &&
 			    !svr->svr_thread_exit)
 				delay(hz);
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c	Thu Nov 21 14:09:46 2019	(r354960)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c	Thu Nov 21 14:10:53 2019	(r354961)
@@ -1252,7 +1252,7 @@ zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
 		 * root zios). This is required because of how we can
 		 * defer the DKIOCFLUSHWRITECACHE commands for each lwb.
 		 *
-		 * When the DKIOCFLUSHWRITECACHE commands are defered,
+		 * When the DKIOCFLUSHWRITECACHE commands are deferred,
 		 * the previous lwb will rely on this lwb to flush the
 		 * vdevs written to by that previous lwb. Thus, we need
 		 * to ensure this lwb doesn't issue the flush until

Modified: vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h	Thu Nov 21 14:09:46 2019	(r354960)
+++ vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h	Thu Nov 21 14:10:53 2019	(r354961)
@@ -597,6 +597,7 @@ typedef struct zpool_load_policy {
 #define	ZPOOL_CONFIG_VDEV_TOP_ZAP	"com.delphix:vdev_zap_top"
 #define	ZPOOL_CONFIG_VDEV_LEAF_ZAP	"com.delphix:vdev_zap_leaf"
 #define	ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS	"com.delphix:has_per_vdev_zaps"
+#define	ZPOOL_CONFIG_RESILVER_DEFER	"com.datto:resilver_defer"
 #define	ZPOOL_CONFIG_CACHEFILE		"cachefile"	/* not stored on disk */
 #define	ZPOOL_CONFIG_MMP_STATE		"mmp_state"	/* not stored on disk */
 #define	ZPOOL_CONFIG_MMP_TXG		"mmp_txg"	/* not stored on disk */
@@ -896,6 +897,7 @@ typedef struct vdev_stat {
 	uint64_t	vs_initialize_state;	/* vdev_initialzing_state_t */
 	uint64_t	vs_initialize_action_time; /* time_t */
 	uint64_t	vs_checkpoint_space;    /* checkpoint-consumed space */
+	uint64_t	vs_resilver_deferred;	/* resilver deferred	*/
 } vdev_stat_t;
 
 /*


