Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 31 Jul 2018 18:49:07 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r336991 - vendor-sys/illumos/dist/uts/common vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor-sys/illumos/dist/uts/common/sys/fs vendor/ill...
Message-ID:  <201807311849.w6VIn7Xt065532@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mav
Date: Tue Jul 31 18:49:07 2018
New Revision: 336991
URL: https://svnweb.freebsd.org/changeset/base/336991

Log:
  9102 zfs should be able to initialize storage devices
  
  The first access to a disk block can incur a performance penalty on some
  platforms (e.g. AWS's EBS, VMware VMDKs). Therefore it is recommended that
  volumes be "thick provisioned", where supported by the platform (VMware).
  Thick provisioning is time consuming and often is ignored. If the thick
  provision step is omitted, customers will see suboptimal performance until
  we have written to all parts of the LUN. ZFS should be able to initialize
  any unused storage to remove any first-write penalty that exists.
  
  illumos/illumos-gate@094e47e980b0796b94b1b8f51f462a64d246e516
  
  Reviewed by: John Wren Kennedy <john.kennedy@delphix.com>
  Reviewed by: Matthew Ahrens <mahrens@delphix.com>
  Reviewed by: Pavel Zakharov <pavel.zakharov@delphix.com>
  Reviewed by: Prakash Surya <prakash.surya@delphix.com>
  Approved by: Richard Lowe <richlowe@richlowe.net>

Modified:
  vendor-sys/illumos/dist/uts/common/Makefile.files
  vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio_priority.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_missing.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_raidz.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_removal.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_root.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c
  vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h

Changes in other areas also in this revision:
Modified:
  vendor/illumos/dist/cmd/zpool/zpool_main.c
  vendor/illumos/dist/cmd/ztest/ztest.c
  vendor/illumos/dist/lib/libzfs/common/libzfs.h
  vendor/illumos/dist/lib/libzfs/common/libzfs_pool.c
  vendor/illumos/dist/lib/libzfs/common/libzfs_util.c
  vendor/illumos/dist/lib/libzfs_core/common/libzfs_core.c
  vendor/illumos/dist/lib/libzfs_core/common/libzfs_core.h
  vendor/illumos/dist/man/man1m/zpool.1m

Modified: vendor-sys/illumos/dist/uts/common/Makefile.files
==============================================================================
--- vendor-sys/illumos/dist/uts/common/Makefile.files	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/Makefile.files	Tue Jul 31 18:49:07 2018	(r336991)
@@ -1398,6 +1398,7 @@ ZFS_COMMON_OBJS +=		\
 	vdev.o			\
 	vdev_cache.o		\
 	vdev_file.o		\
+	vdev_initialize.o	\
 	vdev_label.o		\
 	vdev_mirror.o		\
 	vdev_missing.o		\

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c	Tue Jul 31 18:49:07 2018	(r336991)
@@ -635,6 +635,8 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd
 
 	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
 	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&mg->mg_ms_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&mg->mg_ms_initialize_cv, NULL, CV_DEFAULT, NULL);
 	mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
 	    KM_SLEEP);
 	mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
@@ -681,6 +683,8 @@ metaslab_group_destroy(metaslab_group_t *mg)
 	kmem_free(mg->mg_secondaries, mg->mg_allocators *
 	    sizeof (metaslab_t *));
 	mutex_destroy(&mg->mg_lock);
+	mutex_destroy(&mg->mg_ms_initialize_lock);
+	cv_destroy(&mg->mg_ms_initialize_cv);
 
 	for (int i = 0; i < mg->mg_allocators; i++) {
 		refcount_destroy(&mg->mg_alloc_queue_depth[i]);
@@ -1541,6 +1545,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint6
 	mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
+
 	ms->ms_id = id;
 	ms->ms_start = id << vd->vdev_ms_shift;
 	ms->ms_size = 1ULL << vd->vdev_ms_shift;
@@ -2717,6 +2722,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
 	 * from it in 'metaslab_unload_delay' txgs, then unload it.
 	 */
 	if (msp->ms_loaded &&
+	    msp->ms_initializing == 0 &&
 	    msp->ms_selected_txg + metaslab_unload_delay < txg) {
 		for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
 			VERIFY0(range_tree_space(
@@ -2966,6 +2972,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, u
 	metaslab_class_t *mc = msp->ms_group->mg_class;
 
 	VERIFY(!msp->ms_condensing);
+	VERIFY0(msp->ms_initializing);
 
 	start = mc->mc_ops->msop_alloc(msp, size);
 	if (start != -1ULL) {
@@ -3026,9 +3033,10 @@ find_valid_metaslab(metaslab_group_t *mg, uint64_t act
 		}
 
 		/*
-		 * If the selected metaslab is condensing, skip it.
+			 * If the selected metaslab is condensing or being
+			 * initialized, skip it.
 		 */
-		if (msp->ms_condensing)
+			if (msp->ms_condensing || msp->ms_initializing > 0)
 			continue;
 
 		*was_active = msp->ms_allocator != -1;
@@ -3193,11 +3201,20 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_
 		/*
 		 * If this metaslab is currently condensing then pick again as
 		 * we can't manipulate this metaslab until it's committed
-		 * to disk.
+		 * to disk. If this metaslab is being initialized, we shouldn't
+		 * allocate from it since the allocated region might be
+		 * overwritten after allocation.
 		 */
 		if (msp->ms_condensing) {
 			metaslab_trace_add(zal, mg, msp, asize, d,
 			    TRACE_CONDENSING, allocator);
+			metaslab_passivate(msp, msp->ms_weight &
+			    ~METASLAB_ACTIVE_MASK);
+			mutex_exit(&msp->ms_lock);
+			continue;
+		} else if (msp->ms_initializing > 0) {
+			metaslab_trace_add(zal, mg, msp, asize, d,
+			    TRACE_INITIALIZING, allocator);
 			metaslab_passivate(msp, msp->ms_weight &
 			    ~METASLAB_ACTIVE_MASK);
 			mutex_exit(&msp->ms_lock);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c	Tue Jul 31 18:49:07 2018	(r336991)
@@ -54,6 +54,7 @@
 #include <sys/vdev_removal.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/vdev_indirect_births.h>
+#include <sys/vdev_initialize.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/uberblock_impl.h>
@@ -413,8 +414,9 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp)
 
 				dp = spa_get_dsl(spa);
 				dsl_pool_config_enter(dp, FTAG);
-				if (err = dsl_dataset_hold_obj(dp,
-				    za.za_first_integer, FTAG, &ds)) {
+				err = dsl_dataset_hold_obj(dp,
+				    za.za_first_integer, FTAG, &ds);
+				if (err != 0) {
 					dsl_pool_config_exit(dp, FTAG);
 					break;
 				}
@@ -569,7 +571,8 @@ spa_prop_validate(spa_t *spa, nvlist_t *props)
 					break;
 				}
 
-				if (error = dmu_objset_hold(strval, FTAG, &os))
+				error = dmu_objset_hold(strval, FTAG, &os);
+				if (error != 0)
 					break;
 
 				/*
@@ -1155,8 +1158,10 @@ spa_activate(spa_t *spa, int mode)
 		spa_create_zio_taskqs(spa);
 	}
 
-	for (size_t i = 0; i < TXG_SIZE; i++)
-		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, 0);
+	for (size_t i = 0; i < TXG_SIZE; i++) {
+		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL);
+	}
 
 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_config_dirty_node));
@@ -1315,6 +1320,11 @@ spa_unload(spa_t *spa)
 	 */
 	spa_async_suspend(spa);
 
+	if (spa->spa_root_vdev) {
+		vdev_initialize_stop_all(spa->spa_root_vdev,
+		    VDEV_INITIALIZE_ACTIVE);
+	}
+
 	/*
 	 * Stop syncing.
 	 */
@@ -1330,10 +1340,10 @@ spa_unload(spa_t *spa)
 	 * calling taskq_wait(mg_taskq).
 	 */
 	if (spa->spa_root_vdev != NULL) {
-		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
 			vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
-		spa_config_exit(spa, SCL_ALL, FTAG);
+		spa_config_exit(spa, SCL_ALL, spa);
 	}
 
 	/*
@@ -1367,7 +1377,7 @@ spa_unload(spa_t *spa)
 
 	bpobj_close(&spa->spa_deferred_bpobj);
 
-	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 
 	/*
 	 * Close all vdevs.
@@ -1429,7 +1439,7 @@ spa_unload(spa_t *spa)
 		spa->spa_comment = NULL;
 	}
 
-	spa_config_exit(spa, SCL_ALL, FTAG);
+	spa_config_exit(spa, SCL_ALL, spa);
 }
 
 /*
@@ -3866,6 +3876,10 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char
 		spa_restart_removal(spa);
 
 		spa_spawn_aux_threads(spa);
+
+		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+		vdev_initialize_restart(spa->spa_root_vdev);
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 
 	spa_load_note(spa, "LOADED");
@@ -5347,6 +5361,7 @@ spa_export_common(char *pool, int new_state, nvlist_t 
 	 * in which case we can modify its state.
 	 */
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
+
 		/*
 		 * Objsets may be open only because they're dirty, so we
 		 * have to force it to sync before checking spa_refcnt.
@@ -5381,6 +5396,18 @@ spa_export_common(char *pool, int new_state, nvlist_t 
 		}
 
 		/*
+		 * We're about to export or destroy this pool. Make sure
+		 * we stop all initializtion activity here before we
+		 * set the spa_final_txg. This will ensure that all
+		 * dirty data resulting from the initialization is
+		 * committed to disk before we unload the pool.
+		 */
+		if (spa->spa_root_vdev != NULL) {
+			vdev_initialize_stop_all(spa->spa_root_vdev,
+			    VDEV_INITIALIZE_ACTIVE);
+		}
+
+		/*
 		 * We want this to be reflected on every label,
 		 * so mark them all dirty.  spa_unload() will do the
 		 * final sync that pushes these changes out.
@@ -6070,6 +6097,86 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pg
 	return (error);
 }
 
+int
+spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
+{
+	/*
+	 * We hold the namespace lock through the whole function
+	 * to prevent any changes to the pool while we're starting or
+	 * stopping initialization. The config and state locks are held so that
+	 * we can properly assess the vdev state before we commit to
+	 * the initializing operation.
+	 */
+	mutex_enter(&spa_namespace_lock);
+	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+	/* Look up vdev and ensure it's a leaf. */
+	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+	if (vd == NULL || vd->vdev_detached) {
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		mutex_exit(&spa_namespace_lock);
+		return (SET_ERROR(ENODEV));
+	} else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		mutex_exit(&spa_namespace_lock);
+		return (SET_ERROR(EINVAL));
+	} else if (!vdev_writeable(vd)) {
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		mutex_exit(&spa_namespace_lock);
+		return (SET_ERROR(EROFS));
+	}
+	mutex_enter(&vd->vdev_initialize_lock);
+	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+	/*
+	 * When we activate an initialize action we check to see
+	 * if the vdev_initialize_thread is NULL. We do this instead
+	 * of using the vdev_initialize_state since there might be
+	 * a previous initialization process which has completed but
+	 * the thread is not exited.
+	 */
+	if (cmd_type == POOL_INITIALIZE_DO &&
+	    (vd->vdev_initialize_thread != NULL ||
+	    vd->vdev_top->vdev_removing)) {
+		mutex_exit(&vd->vdev_initialize_lock);
+		mutex_exit(&spa_namespace_lock);
+		return (SET_ERROR(EBUSY));
+	} else if (cmd_type == POOL_INITIALIZE_CANCEL &&
+	    (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
+	    vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
+		mutex_exit(&vd->vdev_initialize_lock);
+		mutex_exit(&spa_namespace_lock);
+		return (SET_ERROR(ESRCH));
+	} else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
+	    vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
+		mutex_exit(&vd->vdev_initialize_lock);
+		mutex_exit(&spa_namespace_lock);
+		return (SET_ERROR(ESRCH));
+	}
+
+	switch (cmd_type) {
+	case POOL_INITIALIZE_DO:
+		vdev_initialize(vd);
+		break;
+	case POOL_INITIALIZE_CANCEL:
+		vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+		break;
+	case POOL_INITIALIZE_SUSPEND:
+		vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED);
+		break;
+	default:
+		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
+	}
+	mutex_exit(&vd->vdev_initialize_lock);
+
+	/* Sync out the initializing state */
+	txg_wait_synced(spa->spa_dsl_pool, 0);
+	mutex_exit(&spa_namespace_lock);
+
+	return (0);
+}
+
+
 /*
  * Split a set of devices from their mirrors, and create a new pool from them.
  */
@@ -6277,6 +6384,19 @@ spa_vdev_split_mirror(spa_t *spa, char *newname, nvlis
 	spa_activate(newspa, spa_mode_global);
 	spa_async_suspend(newspa);
 
+	for (c = 0; c < children; c++) {
+		if (vml[c] != NULL) {
+			/*
+			 * Temporarily stop the initializing activity. We set
+			 * the state to ACTIVE so that we know to resume
+			 * the initializing once the split has completed.
+			 */
+			mutex_enter(&vml[c]->vdev_initialize_lock);
+			vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE);
+			mutex_exit(&vml[c]->vdev_initialize_lock);
+		}
+	}
+
 	newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
 
 	/* create the new pool from the disks of the original pool */
@@ -6364,6 +6484,10 @@ out:
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_FALSE;
 	}
+
+	/* restart initializing disks as necessary */
+	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
+
 	vdev_reopen(spa->spa_root_vdev);
 
 	nvlist_free(spa->spa_config_splitting);
@@ -6739,6 +6863,14 @@ spa_async_thread(void *arg)
 	if (tasks & SPA_ASYNC_RESILVER)
 		dsl_resilver_restart(spa->spa_dsl_pool, 0);
 
+	if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
+		mutex_enter(&spa_namespace_lock);
+		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+		vdev_initialize_restart(spa->spa_root_vdev);
+		spa_config_exit(spa, SCL_CONFIG, FTAG);
+		mutex_exit(&spa_namespace_lock);
+	}
+
 	/*
 	 * Let the world know that we're done.
 	 */
@@ -7384,8 +7516,9 @@ spa_sync(spa_t *spa, uint64_t txg)
 	 * Wait for i/os issued in open context that need to complete
 	 * before this txg syncs.
 	 */
-	VERIFY0(zio_wait(spa->spa_txg_zio[txg & TXG_MASK]));
-	spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, 0);
+	(void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
+	spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
+	    ZIO_FLAG_CANFAIL);
 
 	/*
 	 * Lock out configuration changes.
@@ -7674,7 +7807,8 @@ spa_sync(spa_t *spa, uint64_t txg)
 	/*
 	 * Update usable space statistics.
 	 */
-	while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
+	    != NULL)
 		vdev_sync_done(vd, txg);
 
 	spa_update_dspace(spa);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c	Tue Jul 31 18:49:07 2018	(r336991)
@@ -39,6 +39,7 @@
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_initialize.h>
 #include <sys/metaslab.h>
 #include <sys/uberblock_impl.h>
 #include <sys/txg.h>
@@ -1196,6 +1197,12 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t 
 
 	if (vd != NULL) {
 		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
+		if (vd->vdev_ops->vdev_op_leaf) {
+			mutex_enter(&vd->vdev_initialize_lock);
+			vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
+			mutex_exit(&vd->vdev_initialize_lock);
+		}
+
 		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 		vdev_free(vd);
 		spa_config_exit(spa, SCL_ALL, spa);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h	Tue Jul 31 18:49:07 2018	(r336991)
@@ -68,7 +68,8 @@ typedef enum trace_alloc_type {
 	TRACE_GROUP_FAILURE	= -5ULL,
 	TRACE_ENOSPC		= -6ULL,
 	TRACE_CONDENSING	= -7ULL,
-	TRACE_VDEV_ERROR	= -8ULL
+	TRACE_VDEV_ERROR	= -8ULL,
+	TRACE_INITIALIZING	= -9ULL
 } trace_alloc_type_t;
 
 #define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
@@ -270,6 +271,11 @@ struct metaslab_group {
 	uint64_t		mg_failed_allocations;
 	uint64_t		mg_fragmentation;
 	uint64_t		mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+	int			mg_ms_initializing;
+	boolean_t		mg_initialize_updating;
+	kmutex_t		mg_ms_initialize_lock;
+	kcondvar_t		mg_ms_initialize_cv;
 };
 
 /*
@@ -359,6 +365,8 @@ struct metaslab {
 	boolean_t	ms_condensing;	/* condensing? */
 	boolean_t	ms_condense_wanted;
 	uint64_t	ms_condense_checked_txg;
+
+	uint64_t	ms_initializing; /* leaves initializing this ms */
 
 	/*
 	 * We must hold both ms_lock and ms_group->mg_lock in order to

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h	Tue Jul 31 18:49:07 2018	(r336991)
@@ -650,6 +650,7 @@ extern int spa_scan_get_stats(spa_t *spa, pool_scan_st
 #define	SPA_ASYNC_AUTOEXPAND	0x20
 #define	SPA_ASYNC_REMOVE_DONE	0x40
 #define	SPA_ASYNC_REMOVE_STOP	0x80
+#define	SPA_ASYNC_INITIALIZE_RESTART	0x100
 
 /*
  * Controls the behavior of spa_vdev_remove().
@@ -665,6 +666,7 @@ extern int spa_vdev_detach(spa_t *spa, uint64_t guid, 
     int replace_done);
 extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
 extern boolean_t spa_vdev_remove_active(spa_t *spa);
+extern int spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
 extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
 extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h	Tue Jul 31 18:49:07 2018	(r336991)
@@ -79,6 +79,12 @@ typedef void	vdev_remap_cb_t(uint64_t inner_offset, vd
     uint64_t offset, uint64_t size, void *arg);
 typedef void	vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size,
     vdev_remap_cb_t callback, void *arg);
+/*
+ * Given a target vdev, translates the logical range "in" to the physical
+ * range "res"
+ */
+typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg_t *in,
+    range_seg_t *res);
 
 typedef struct vdev_ops {
 	vdev_open_func_t		*vdev_op_open;
@@ -90,6 +96,11 @@ typedef struct vdev_ops {
 	vdev_hold_func_t		*vdev_op_hold;
 	vdev_rele_func_t		*vdev_op_rele;
 	vdev_remap_func_t		*vdev_op_remap;
+	/*
+	 * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves.
+	 * Used when initializing vdevs. Isn't used by leaf ops.
+	 */
+	vdev_xlation_func_t		*vdev_op_xlate;
 	char				vdev_op_type[16];
 	boolean_t			vdev_op_leaf;
 } vdev_ops_t;
@@ -231,7 +242,25 @@ struct vdev {
 
 	/* pool checkpoint related */
 	space_map_t	*vdev_checkpoint_sm;	/* contains reserved blocks */
+	
+	boolean_t	vdev_initialize_exit_wanted;
+	vdev_initializing_state_t	vdev_initialize_state;
+	kthread_t	*vdev_initialize_thread;
+	/* Protects vdev_initialize_thread and vdev_initialize_state. */
+	kmutex_t	vdev_initialize_lock;
+	kcondvar_t	vdev_initialize_cv;
+	uint64_t	vdev_initialize_offset[TXG_SIZE];
+	uint64_t	vdev_initialize_last_offset;
+	range_tree_t	*vdev_initialize_tree;	/* valid while initializing */
+	uint64_t	vdev_initialize_bytes_est;
+	uint64_t	vdev_initialize_bytes_done;
+	time_t		vdev_initialize_action_time;	/* start and end time */
 
+	/* for limiting outstanding I/Os */
+	kmutex_t	vdev_initialize_io_lock;
+	kcondvar_t	vdev_initialize_io_cv;
+	uint64_t	vdev_initialize_inflight;
+
 	/*
 	 * Values stored in the config for an indirect or removing vdev.
 	 */
@@ -434,6 +463,8 @@ extern vdev_ops_t vdev_indirect_ops;
 /*
  * Common size functions
  */
+extern void vdev_default_xlate(vdev_t *vd, const range_seg_t *in,
+    range_seg_t *out);
 extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize);
 extern uint64_t vdev_get_min_asize(vdev_t *vd);
 extern void vdev_set_min_asize(vdev_t *vd);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio_priority.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio_priority.h	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio_priority.h	Tue Jul 31 18:49:07 2018	(r336991)
@@ -13,7 +13,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
  */
 #ifndef	_ZIO_PRIORITY_H
 #define	_ZIO_PRIORITY_H
@@ -29,6 +29,7 @@ typedef enum zio_priority {
 	ZIO_PRIORITY_ASYNC_WRITE,	/* spa_sync() */
 	ZIO_PRIORITY_SCRUB,		/* asynchronous scrub/resilver reads */
 	ZIO_PRIORITY_REMOVAL,		/* reads/writes for vdev removal */
+	ZIO_PRIORITY_INITIALIZING,	/* initializing I/O */
 	ZIO_PRIORITY_NUM_QUEUEABLE,
 
 	ZIO_PRIORITY_NOW		/* non-queued i/os (e.g. free) */

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c	Tue Jul 31 18:49:07 2018	(r336991)
@@ -49,6 +49,7 @@
 #include <sys/zil.h>
 #include <sys/dsl_scan.h>
 #include <sys/abd.h>
+#include <sys/vdev_initialize.h>
 
 /*
  * Virtual device management.
@@ -183,6 +184,14 @@ vdev_getops(const char *type)
 	return (ops);
 }
 
+/* ARGSUSED */
+void
+vdev_default_xlate(vdev_t *vd, const range_seg_t *in, range_seg_t *res)
+{
+	res->rs_start = in->rs_start;
+	res->rs_end = in->rs_end;
+}
+
 /*
  * Default asize function: return the MAX of psize with the asize of
  * all children.  This is what's used by anything other than RAID-Z.
@@ -453,6 +462,11 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid
 	mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
+	cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
+
 	for (int t = 0; t < DTL_TYPES; t++) {
 		vd->vdev_dtl[t] = range_tree_create(NULL, NULL);
 	}
@@ -725,6 +739,7 @@ void
 vdev_free(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
+	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 
 	/*
 	 * vdev_free() implies closing the vdev first.  This is simpler than
@@ -743,6 +758,7 @@ vdev_free(vdev_t *vd)
 
 	ASSERT(vd->vdev_child == NULL);
 	ASSERT(vd->vdev_guid_sum == vd->vdev_guid);
+	ASSERT(vd->vdev_initialize_thread == NULL);
 
 	/*
 	 * Discard allocation state.
@@ -815,6 +831,10 @@ vdev_free(vdev_t *vd)
 	mutex_destroy(&vd->vdev_dtl_lock);
 	mutex_destroy(&vd->vdev_stat_lock);
 	mutex_destroy(&vd->vdev_probe_lock);
+	mutex_destroy(&vd->vdev_initialize_lock);
+	mutex_destroy(&vd->vdev_initialize_io_lock);
+	cv_destroy(&vd->vdev_initialize_io_cv);
+	cv_destroy(&vd->vdev_initialize_cv);
 
 	if (vd == spa->spa_root_vdev)
 		spa->spa_root_vdev = NULL;
@@ -2841,7 +2861,8 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
 
 	ASSERT(vdev_is_concrete(vd));
 
-	while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
+	while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg)))
+	    != NULL)
 		metaslab_sync_done(msp, txg);
 
 	if (reassess)
@@ -3067,6 +3088,15 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
 
+	/* Restart initializing if necessary */
+	mutex_enter(&vd->vdev_initialize_lock);
+	if (vdev_writeable(vd) &&
+	    vd->vdev_initialize_thread == NULL &&
+	    vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) {
+		(void) vdev_initialize(vd);
+	}
+	mutex_exit(&vd->vdev_initialize_lock);
+
 	if (wasoffline ||
 	    (oldstate < VDEV_STATE_DEGRADED &&
 	    vd->vdev_state >= VDEV_STATE_DEGRADED))
@@ -3361,8 +3391,18 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
 	vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
 	vs->vs_state = vd->vdev_state;
 	vs->vs_rsize = vdev_get_min_asize(vd);
-	if (vd->vdev_ops->vdev_op_leaf)
+	if (vd->vdev_ops->vdev_op_leaf) {
 		vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
+		/*
+		 * Report intializing progress. Since we don't have the
+		 * initializing locks held, this is only an estimate (although a
+		 * fairly accurate one).
+		 */
+		vs->vs_initialize_bytes_done = vd->vdev_initialize_bytes_done;
+		vs->vs_initialize_bytes_est = vd->vdev_initialize_bytes_est;
+		vs->vs_initialize_state = vd->vdev_initialize_state;
+		vs->vs_initialize_action_time = vd->vdev_initialize_action_time;
+	}
 	/*
 	 * Report expandable space on top-level, non-auxillary devices only.
 	 * The expandable space is reported in terms of metaslab sized units

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c	Tue Jul 31 18:49:07 2018	(r336991)
@@ -840,6 +840,7 @@ vdev_ops_t vdev_disk_ops = {
 	vdev_disk_hold,
 	vdev_disk_rele,
 	NULL,
+	vdev_default_xlate,
 	VDEV_TYPE_DISK,		/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c	Tue Jul 31 18:49:07 2018	(r336991)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -263,6 +263,7 @@ vdev_ops_t vdev_file_ops = {
 	vdev_file_hold,
 	vdev_file_rele,
 	NULL,
+	vdev_default_xlate,
 	VDEV_TYPE_FILE,		/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };
@@ -282,6 +283,7 @@ vdev_ops_t vdev_disk_ops = {
 	vdev_file_hold,
 	vdev_file_rele,
 	NULL,
+	vdev_default_xlate,
 	VDEV_TYPE_DISK,		/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c	Tue Jul 31 18:49:07 2018	(r336991)
@@ -1628,6 +1628,7 @@ vdev_ops_t vdev_indirect_ops = {
 	NULL,
 	NULL,
 	vdev_indirect_remap,
+	NULL,
 	VDEV_TYPE_INDIRECT,	/* name of this vdev type */
 	B_FALSE			/* leaf vdev */
 };

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c	Tue Jul 31 18:49:07 2018	(r336991)
@@ -564,6 +564,7 @@ vdev_ops_t vdev_mirror_ops = {
 	NULL,
 	NULL,
 	NULL,
+	vdev_default_xlate,
 	VDEV_TYPE_MIRROR,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
@@ -578,6 +579,7 @@ vdev_ops_t vdev_replacing_ops = {
 	NULL,
 	NULL,
 	NULL,
+	vdev_default_xlate,
 	VDEV_TYPE_REPLACING,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };
@@ -592,6 +594,7 @@ vdev_ops_t vdev_spare_ops = {
 	NULL,
 	NULL,
 	NULL,
+	vdev_default_xlate,
 	VDEV_TYPE_SPARE,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_missing.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_missing.c	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_missing.c	Tue Jul 31 18:49:07 2018	(r336991)
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  */
 
 /*
@@ -89,6 +89,7 @@ vdev_ops_t vdev_missing_ops = {
 	NULL,
 	NULL,
 	NULL,
+	NULL,
 	VDEV_TYPE_MISSING,	/* name of this vdev type */
 	B_TRUE			/* leaf vdev */
 };
@@ -99,6 +100,7 @@ vdev_ops_t vdev_hole_ops = {
 	vdev_default_asize,
 	vdev_missing_io_start,
 	vdev_missing_io_done,
+	NULL,
 	NULL,
 	NULL,
 	NULL,

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c	Tue Jul 31 18:49:07 2018	(r336991)
@@ -150,6 +150,8 @@ uint32_t zfs_vdev_scrub_min_active = 1;
 uint32_t zfs_vdev_scrub_max_active = 2;
 uint32_t zfs_vdev_removal_min_active = 1;
 uint32_t zfs_vdev_removal_max_active = 2;
+uint32_t zfs_vdev_initializing_min_active = 1;
+uint32_t zfs_vdev_initializing_max_active = 1;
 
 /*
  * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -407,6 +409,8 @@ vdev_queue_class_min_active(zio_priority_t p)
 		return (zfs_vdev_scrub_min_active);
 	case ZIO_PRIORITY_REMOVAL:
 		return (zfs_vdev_removal_min_active);
+	case ZIO_PRIORITY_INITIALIZING:
+		return (zfs_vdev_initializing_min_active);
 	default:
 		panic("invalid priority %u", p);
 		return (0);
@@ -468,6 +472,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t
 		return (zfs_vdev_scrub_max_active);
 	case ZIO_PRIORITY_REMOVAL:
 		return (zfs_vdev_removal_max_active);
+	case ZIO_PRIORITY_INITIALIZING:
+		return (zfs_vdev_initializing_max_active);
 	default:
 		panic("invalid priority %u", p);
 		return (0);
@@ -688,8 +694,8 @@ again:
 	}
 
 	/*
-	 * For LBA-ordered queues (async / scrub), issue the i/o which follows
-	 * the most recently issued i/o in LBA (offset) order.
+	 * For LBA-ordered queues (async / scrub / initializing), issue the
+	 * i/o which follows the most recently issued i/o in LBA (offset) order.
 	 *
 	 * For FIFO queues (sync), issue the i/o with the lowest timestamp.
 	 */
@@ -745,13 +751,15 @@ vdev_queue_io(zio_t *zio)
 		if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
 		    zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
 		    zio->io_priority != ZIO_PRIORITY_SCRUB &&
-		    zio->io_priority != ZIO_PRIORITY_REMOVAL)
+		    zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+		    zio->io_priority != ZIO_PRIORITY_INITIALIZING)
 			zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
 	} else {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
 		    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
-		    zio->io_priority != ZIO_PRIORITY_REMOVAL)
+		    zio->io_priority != ZIO_PRIORITY_REMOVAL &&
+		    zio->io_priority != ZIO_PRIORITY_INITIALIZING)
 			zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
 	}
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_raidz.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_raidz.c	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_raidz.c	Tue Jul 31 18:49:07 2018	(r336991)
@@ -38,6 +38,10 @@
 #include <sys/fs/zfs.h>
 #include <sys/fm/fs/zfs.h>
 
+#ifdef ZFS_DEBUG
+#include <sys/vdev_initialize.h>	/* vdev_xlate testing */
+#endif
+
 /*
  * Virtual device vector for RAID-Z.
  *
@@ -1884,6 +1888,39 @@ vdev_raidz_child_done(zio_t *zio)
 	rc->rc_skipped = 0;
 }
 
+static void
+vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col)
+{
+#ifdef ZFS_DEBUG
+	vdev_t *vd = zio->io_vd;
+	vdev_t *tvd = vd->vdev_top;
+
+	range_seg_t logical_rs, physical_rs;
+	logical_rs.rs_start = zio->io_offset;
+	logical_rs.rs_end = logical_rs.rs_start +
+	    vdev_raidz_asize(zio->io_vd, zio->io_size);
+
+	raidz_col_t *rc = &rm->rm_col[col];
+	vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
+
+	vdev_xlate(cvd, &logical_rs, &physical_rs);
+	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
+	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
+	/*
+	 * It would be nice to assert that rs_end is equal
+	 * to rc_offset + rc_size but there might be an
+	 * optional I/O at the end that is not accounted in
+	 * rc_size.
+	 */
+	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
+		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
+		    rc->rc_size + (1 << tvd->vdev_ashift));
+	} else {
+		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
+	}
+#endif
+}
+
 /*
  * Start an IO operation on a RAIDZ VDev
  *
@@ -1926,6 +1963,12 @@ vdev_raidz_io_start(zio_t *zio)
 		for (c = 0; c < rm->rm_cols; c++) {
 			rc = &rm->rm_col[c];
 			cvd = vd->vdev_child[rc->rc_devidx];
+
+			/*
+			 * Verify physical to logical translation.
+			 */
+			vdev_raidz_io_verify(zio, rm, c);
+
 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
 			    rc->rc_offset, rc->rc_abd, rc->rc_size,
 			    zio->io_type, zio->io_priority, 0,
@@ -2555,6 +2598,37 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int d
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
 }
 
+static void
+vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res)
+{
+	vdev_t *raidvd = cvd->vdev_parent;
+	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
+
+	uint64_t width = raidvd->vdev_children;
+	uint64_t tgt_col = cvd->vdev_id;
+	uint64_t ashift = raidvd->vdev_top->vdev_ashift;
+
+	/* make sure the offsets are block-aligned */
+	ASSERT0(in->rs_start % (1 << ashift));
+	ASSERT0(in->rs_end % (1 << ashift));
+	uint64_t b_start = in->rs_start >> ashift;
+	uint64_t b_end = in->rs_end >> ashift;
+
+	uint64_t start_row = 0;
+	if (b_start > tgt_col) /* avoid underflow */
+		start_row = ((b_start - tgt_col - 1) / width) + 1;
+
+	uint64_t end_row = 0;
+	if (b_end > tgt_col)
+		end_row = ((b_end - tgt_col - 1) / width) + 1;
+
+	res->rs_start = start_row << ashift;
+	res->rs_end = end_row << ashift;
+
+	ASSERT3U(res->rs_start, <=, in->rs_start);
+	ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start);
+}
+
 vdev_ops_t vdev_raidz_ops = {
 	vdev_raidz_open,
 	vdev_raidz_close,
@@ -2565,6 +2639,7 @@ vdev_ops_t vdev_raidz_ops = {
 	NULL,
 	NULL,
 	NULL,
+	vdev_raidz_xlate,
 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
 	B_FALSE			/* not a leaf vdev */
 };

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_removal.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_removal.c	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_removal.c	Tue Jul 31 18:49:07 2018	(r336991)
@@ -44,6 +44,7 @@
 #include <sys/vdev_indirect_births.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/abd.h>
+#include <sys/vdev_initialize.h>
 
 /*
  * This file contains the necessary logic to remove vdevs from a
@@ -1021,6 +1022,7 @@ vdev_remove_complete(spa_t *spa)
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 	txg = spa_vdev_enter(spa);
 	vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id);
+	ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
 
 	sysevent_t *ev = spa_event_create(spa, vd, NULL,
 	    ESC_ZFS_VDEV_REMOVE_DEV);
@@ -1659,6 +1661,9 @@ spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
 	/* Make sure these changes are sync'ed */
 	spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);
 
+	/* Stop initializing */
+	(void) vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED);
+
 	*txg = spa_vdev_config_enter(spa);
 
 	sysevent_t *ev = spa_event_create(spa, vd, NULL,
@@ -1819,6 +1824,13 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
 	 */
 	error = spa_reset_logs(spa);
 
+	/*
+	 * We stop any initializing that is currently in progress but leave
+	 * the state as "active". This will allow the initializing to resume
+	 * if the removal is canceled sometime later.
+	 */
+	vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE);
+
 	*txg = spa_vdev_config_enter(spa);
 
 	/*
@@ -1830,6 +1842,7 @@ spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
 
 	if (error != 0) {
 		metaslab_group_activate(mg);
+		spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
 		return (error);
 	}
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_root.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_root.c	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_root.c	Tue Jul 31 18:49:07 2018	(r336991)
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -146,6 +146,7 @@ vdev_ops_t vdev_root_ops = {
 	NULL,			/* io_start - not applicable to the root */
 	NULL,			/* io_done - not applicable to the root */
 	vdev_root_state_change,
+	NULL,
 	NULL,
 	NULL,
 	NULL,

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c	Tue Jul 31 18:32:57 2018	(r336990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c	Tue Jul 31 18:49:07 2018	(r336991)
@@ -189,6 +189,8 @@
 #include <sys/zcp.h>
 #include <sys/zio_checksum.h>
 #include <sys/vdev_removal.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_initialize.h>
 
 #include "zfs_namecheck.h"
 #include "zfs_prop.h"
@@ -3707,6 +3709,80 @@ zfs_ioc_destroy(zfs_cmd_t *zc)
 }
 
 /*
+ * innvl: {
+ *     vdevs: {
+ *         guid 1, guid 2, ...
+ *     },
+ *     func: POOL_INITIALIZE_{CANCEL|DO|SUSPEND}
+ * }
+ *
+ * outnvl: {
+ *     [func: EINVAL (if provided command type didn't make sense)],
+ *     [vdevs: {
+ *         guid1: errno, (see function body for possible errnos)

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201807311849.w6VIn7Xt065532>