Date:      Thu, 2 Aug 2018 21:59:47 +0000 (UTC)
From:      Alexander Motin <mav@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r337191 - in head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys
Message-ID:  <201808022159.w72LxlaQ065450@repo.freebsd.org>

Author: mav
Date: Thu Aug  2 21:59:46 2018
New Revision: 337191
URL: https://svnweb.freebsd.org/changeset/base/337191

Log:
  MFV r337190: 9486 reduce memory used by device removal on fragmented pools
  
  In the most fragmented real-world cases, this reduces memory used by the
  mapping from ~1GB to ~50MB of RAM per 1TB of storage removed. Less
  fragmented cases will typically also use around 50-100MB of RAM per 1TB
  of storage removed.
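
  (For scale, illustrative arithmetic rather than figures from this
  change: if each in-core mapping entry costs on the order of 100 bytes,
  ~1GB per 1TB is roughly ten million entries, about one per ~100KB of
  removed space, while ~50MB is roughly half a million entries, about
  one per ~2MB.  The saving comes from merging small free gaps into
  neighboring mappings, so far fewer entries are needed.)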
  
  illumos/illumos-gate@cfd63e1b1bcf7ba4bf72f55ddbd87ce008d2986d
  
  Reviewed by: George Wilson <george.wilson@delphix.com>
  Reviewed by: Serapheim Dimitropoulos <serapheim.dimitro@delphix.com>
  Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
  Reviewed by: Tim Chase <tim@chase2k.com>
  Approved by: Robert Mustacchi <rm@joyent.com>
  Author:     Matthew Ahrens <mahrens@delphix.com>

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
Directory Properties:
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c	Thu Aug  2 21:57:59 2018	(r337190)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c	Thu Aug  2 21:59:46 2018	(r337191)
@@ -491,7 +491,6 @@ range_tree_resize_segment(range_tree_t *rt, range_seg_
 static range_seg_t *
 range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
 {
-	avl_index_t where;
 	range_seg_t rsearch;
 	uint64_t end = start + size;
 
@@ -499,7 +498,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start,
 
 	rsearch.rs_start = start;
 	rsearch.rs_end = end;
-	return (avl_find(&rt->rt_root, &rsearch, &where));
+	return (avl_find(&rt->rt_root, &rsearch, NULL));
 }
 
 range_seg_t *
@@ -650,4 +649,24 @@ range_tree_is_empty(range_tree_t *rt)
 {
 	ASSERT(rt != NULL);
 	return (range_tree_space(rt) == 0);
+}
+
+uint64_t
+range_tree_min(range_tree_t *rt)
+{
+	range_seg_t *rs = avl_first(&rt->rt_root);
+	return (rs != NULL ? rs->rs_start : 0);
+}
+
+uint64_t
+range_tree_max(range_tree_t *rt)
+{
+	range_seg_t *rs = avl_last(&rt->rt_root);
+	return (rs != NULL ? rs->rs_end : 0);
+}
+
+uint64_t
+range_tree_span(range_tree_t *rt)
+{
+	return (range_tree_max(rt) - range_tree_min(rt));
 }
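
The helpers added above return the lowest start offset in the tree, the
highest end offset, and the distance between them (which includes any
gaps between segments).  A minimal standalone sketch of those semantics,
using a sorted array in place of the AVL tree of range_seg_t (seg_min
and seg_max are hypothetical names, not ZFS code):

	#include <assert.h>
	#include <stdint.h>

	struct seg { uint64_t start, end; };	/* [start, end), sorted */

	static uint64_t
	seg_min(const struct seg *s, int n)
	{
		return (n > 0 ? s[0].start : 0);
	}

	static uint64_t
	seg_max(const struct seg *s, int n)
	{
		return (n > 0 ? s[n - 1].end : 0);
	}

	int
	main(void)
	{
		/* Two allocated segments with a gap between them. */
		struct seg segs[] = { { 10, 20 }, { 30, 40 } };

		assert(seg_min(segs, 2) == 10);
		assert(seg_max(segs, 2) == 40);
		/* The span covers the gap too: 40 - 10 == 30. */
		assert(seg_max(segs, 2) - seg_min(segs, 2) == 30);
		return (0);
	}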

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h	Thu Aug  2 21:57:59 2018	(r337190)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h	Thu Aug  2 21:59:46 2018	(r337191)
@@ -95,6 +95,9 @@ boolean_t range_tree_is_empty(range_tree_t *rt);
 void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
 void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
 void range_tree_stat_verify(range_tree_t *rt);
+uint64_t range_tree_min(range_tree_t *rt);
+uint64_t range_tree_max(range_tree_t *rt);
+uint64_t range_tree_span(range_tree_t *rt);
 
 void range_tree_add(void *arg, uint64_t start, uint64_t size);
 void range_tree_remove(void *arg, uint64_t start, uint64_t size);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h	Thu Aug  2 21:57:59 2018	(r337190)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h	Thu Aug  2 21:59:46 2018	(r337191)
@@ -86,6 +86,9 @@ extern void spa_vdev_remove_suspend(spa_t *);
 extern int spa_vdev_remove_cancel(spa_t *);
 extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);
 
+extern int vdev_removal_max_span;
+extern int zfs_remove_max_segment;
+
 #ifdef	__cplusplus
 }
 #endif

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c	Thu Aug  2 21:57:59 2018	(r337190)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c	Thu Aug  2 21:59:46 2018	(r337191)
@@ -33,15 +33,15 @@
  *	1. Uniquely identify this device as part of a ZFS pool and confirm its
  *	   identity within the pool.
  *
- * 	2. Verify that all the devices given in a configuration are present
+ *	2. Verify that all the devices given in a configuration are present
  *         within the pool.
  *
- * 	3. Determine the uberblock for the pool.
+ *	3. Determine the uberblock for the pool.
  *
- * 	4. In case of an import operation, determine the configuration of the
+ *	4. In case of an import operation, determine the configuration of the
  *         toplevel vdev of which it is a part.
  *
- * 	5. If an import operation cannot find all the devices in the pool,
+ *	5. If an import operation cannot find all the devices in the pool,
  *         provide enough information to the administrator to determine which
  *         devices are missing.
  *
@@ -77,9 +77,9 @@
  * In order to identify which labels are valid, the labels are written in the
  * following manner:
  *
- * 	1. For each vdev, update 'L1' to the new label
- * 	2. Update the uberblock
- * 	3. For each vdev, update 'L2' to the new label
+ *	1. For each vdev, update 'L1' to the new label
+ *	2. Update the uberblock
+ *	3. For each vdev, update 'L2' to the new label
  *
  * Given arbitrary failure, we can determine the correct label to use based on
  * the transaction group.  If we fail after updating L1 but before updating the
@@ -117,19 +117,19 @@
  *
  * The nvlist describing the pool and vdev contains the following elements:
  *
- * 	version		ZFS on-disk version
- * 	name		Pool name
- * 	state		Pool state
- * 	txg		Transaction group in which this label was written
- * 	pool_guid	Unique identifier for this pool
- * 	vdev_tree	An nvlist describing vdev tree.
+ *	version		ZFS on-disk version
+ *	name		Pool name
+ *	state		Pool state
+ *	txg		Transaction group in which this label was written
+ *	pool_guid	Unique identifier for this pool
+ *	vdev_tree	An nvlist describing vdev tree.
  *	features_for_read
  *			An nvlist of the features necessary for reading the MOS.
  *
  * Each leaf device label also contains the following:
  *
- * 	top_guid	Unique ID for top-level vdev in which this is contained
- * 	guid		Unique ID for the leaf vdev
+ *	top_guid	Unique ID for top-level vdev in which this is contained
+ *	guid		Unique ID for the leaf vdev
  *
  * The 'vs' configuration follows the format described in 'spa_config.c'.
  */
@@ -396,22 +396,33 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t
 			 * histograms.
 			 */
 			uint64_t seg_count = 0;
+			uint64_t to_alloc = vd->vdev_stat.vs_alloc;
 
 			/*
 			 * There are the same number of allocated segments
 			 * as free segments, so we will have at least one
-			 * entry per free segment.
+			 * entry per free segment.  However, small free
+			 * segments (smaller than vdev_removal_max_span)
+			 * will be combined with adjacent allocated segments
+			 * as a single mapping.
 			 */
 			for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
-				seg_count += vd->vdev_mg->mg_histogram[i];
+				if (1ULL << (i + 1) < vdev_removal_max_span) {
+					to_alloc +=
+					    vd->vdev_mg->mg_histogram[i] <<
+					    i + 1;
+				} else {
+					seg_count +=
+					    vd->vdev_mg->mg_histogram[i];
+				}
 			}
 
 			/*
-			 * The maximum length of a mapping is SPA_MAXBLOCKSIZE,
-			 * so we need at least one entry per SPA_MAXBLOCKSIZE
-			 * of allocated data.
+			 * The maximum length of a mapping is
+			 * zfs_remove_max_segment, so we need at least one entry
+			 * per zfs_remove_max_segment of allocated data.
 			 */
-			seg_count += vd->vdev_stat.vs_alloc / SPA_MAXBLOCKSIZE;
+			seg_count += to_alloc / zfs_remove_max_segment;
 
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
 			    seg_count *
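
In words: free segments smaller than vdev_removal_max_span are expected
to be merged into adjacent mappings, so their bytes are counted as if
they were allocated, while each larger free segment costs one mapping
entry; the allocated total then needs one entry per
zfs_remove_max_segment.  A hedged standalone restatement of that
arithmetic (estimate_entries is a hypothetical name, not ZFS code;
hist[i] counts free segments of size [2^i, 2^(i+1))):

	#include <stdint.h>
	#include <stdio.h>

	#define	HIST_SIZE	64

	static uint64_t
	estimate_entries(const uint64_t *hist, uint64_t alloc_bytes,
	    uint64_t max_span, uint64_t max_segment)
	{
		uint64_t seg_count = 0;
		uint64_t to_alloc = alloc_bytes;

		for (int i = 0; i < HIST_SIZE; i++) {
			/* The i + 1 < 64 guard avoids an undefined shift. */
			if (i + 1 < 64 && (1ULL << (i + 1)) < max_span) {
				/* Small free segs merge; count as allocated. */
				to_alloc += hist[i] << (i + 1);
			} else {
				/* One entry per large free segment. */
				seg_count += hist[i];
			}
		}
		/* Plus one entry per max_segment of allocated data. */
		return (seg_count + to_alloc / max_segment);
	}

	int
	main(void)
	{
		uint64_t hist[HIST_SIZE] = { 0 };

		hist[12] = 1000;	/* ~4-8KB free segs: merged */
		hist[20] = 100;		/* ~1-2MB free segs: one entry each */

		printf("%llu entries\n", (unsigned long long)
		    estimate_entries(hist, 1ULL << 30,	/* 1GB allocated */
		    32 * 1024, 1024 * 1024));
		return (0);
	}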

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c	Thu Aug  2 21:57:59 2018	(r337190)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c	Thu Aug  2 21:59:46 2018	(r337191)
@@ -106,6 +106,24 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
 int zfs_remove_max_segment = 1024 * 1024;
 
 /*
+ * Allow a remap segment to span free chunks of at most this size. The main
+ * impact of a larger span is that we will read and write larger, more
+ * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
+ * for iops.  The value here was chosen to align with
+ * zfs_vdev_read_gap_limit, which is a similar concept when doing regular
+ * reads (but there's no reason it has to be the same).
+ *
+ * Additionally, a higher span will have the following relatively minor
+ * effects:
+ *  - the mapping will be smaller, since one entry can cover more allocated
+ *    segments
+ *  - more of the fragmentation in the removing device will be preserved
+ *  - we'll do larger allocations, which may fail and fall back on smaller
+ *    allocations
+ */
+int vdev_removal_max_span = 32 * 1024;
+
+/*
  * This is used by the test suite so that it can ensure that certain
  * actions happen while in the middle of a removal.
  */
@@ -726,13 +744,52 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
 	spa_sync_removing_state(spa, tx);
 }
 
+typedef struct vdev_copy_segment_arg {
+	spa_t *vcsa_spa;
+	dva_t *vcsa_dest_dva;
+	uint64_t vcsa_txg;
+	range_tree_t *vcsa_obsolete_segs;
+} vdev_copy_segment_arg_t;
+
+static void
+unalloc_seg(void *arg, uint64_t start, uint64_t size)
+{
+	vdev_copy_segment_arg_t *vcsa = arg;
+	spa_t *spa = vcsa->vcsa_spa;
+	blkptr_t bp = { 0 };
+
+	BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL);
+	BP_SET_LSIZE(&bp, size);
+	BP_SET_PSIZE(&bp, size);
+	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF);
+	BP_SET_TYPE(&bp, DMU_OT_NONE);
+	BP_SET_LEVEL(&bp, 0);
+	BP_SET_DEDUP(&bp, 0);
+	BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
+
+	DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva));
+	DVA_SET_OFFSET(&bp.blk_dva[0],
+	    DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start);
+	DVA_SET_ASIZE(&bp.blk_dva[0], size);
+
+	zio_free(spa, vcsa->vcsa_txg, &bp);
+}
+
 /*
  * All reads and writes associated with a call to spa_vdev_copy_segment()
  * are done.
  */
 static void
-spa_vdev_copy_nullzio_done(zio_t *zio)
+spa_vdev_copy_segment_done(zio_t *zio)
 {
+	vdev_copy_segment_arg_t *vcsa = zio->io_private;
+
+	range_tree_vacate(vcsa->vcsa_obsolete_segs,
+	    unalloc_seg, vcsa);
+	range_tree_destroy(vcsa->vcsa_obsolete_segs);
+	kmem_free(vcsa, sizeof (*vcsa));
+
 	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
 }
 
@@ -849,7 +906,8 @@ spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *n
  * read from the old location and write to the new location.
  */
 static int
-spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
+spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
+    uint64_t maxalloc, uint64_t txg,
     vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
 {
 	metaslab_group_t *mg = vd->vdev_mg;
@@ -857,9 +915,40 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint
 	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
 	vdev_indirect_mapping_entry_t *entry;
 	dva_t dst = { 0 };
+	uint64_t start = range_tree_min(segs);
 
-	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+	ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
 
+	uint64_t size = range_tree_span(segs);
+	if (range_tree_span(segs) > maxalloc) {
+		/*
+		 * We can't allocate all the segments.  Prefer to end
+		 * the allocation at the end of a segment, thus avoiding
+		 * additional split blocks.
+		 */
+		range_seg_t search;
+		avl_index_t where;
+		search.rs_start = start + maxalloc;
+		search.rs_end = search.rs_start;
+		range_seg_t *rs = avl_find(&segs->rt_root, &search, &where);
+		if (rs == NULL) {
+			rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE);
+		} else {
+			rs = AVL_PREV(&segs->rt_root, rs);
+		}
+		if (rs != NULL) {
+			size = rs->rs_end - start;
+		} else {
+			/*
+			 * There are no segments that end before maxalloc.
+			 * I.e. the first segment is larger than maxalloc,
+			 * so we must split it.
+			 */
+			size = maxalloc;
+		}
+	}
+	ASSERT3U(size, <=, maxalloc);
+
 	/*
 	 * We use allocator 0 for this I/O because we don't expect device remap
 	 * to be the steady state of the system, so parallelizing is not as
@@ -873,6 +962,31 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint
 		return (error);
 
 	/*
+	 * Determine the ranges that are not actually needed.  Offsets are
+	 * relative to the start of the range to be copied (i.e. relative to the
+	 * local variable "start").
+	 */
+	range_tree_t *obsolete_segs = range_tree_create(NULL, NULL);
+
+	range_seg_t *rs = avl_first(&segs->rt_root);
+	ASSERT3U(rs->rs_start, ==, start);
+	uint64_t prev_seg_end = rs->rs_end;
+	while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) {
+		if (rs->rs_start >= start + size) {
+			break;
+		} else {
+			range_tree_add(obsolete_segs,
+			    prev_seg_end - start,
+			    rs->rs_start - prev_seg_end);
+		}
+		prev_seg_end = rs->rs_end;
+	}
+	/* We don't end in the middle of an obsolete range */
+	ASSERT3U(start + size, <=, prev_seg_end);
+
+	range_tree_clear(segs, start, size);
+
+	/*
 	 * We can't have any padding of the allocated size, otherwise we will
 	 * misunderstand what's allocated, and the size of the mapping.
 	 * The caller ensures this will be true by passing in a size that is
@@ -883,13 +997,22 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint
 	entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
 	DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
 	entry->vime_mapping.vimep_dst = dst;
+	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+		entry->vime_obsolete_count = range_tree_space(obsolete_segs);
+	}
 
+	vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
+	vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
+	vcsa->vcsa_obsolete_segs = obsolete_segs;
+	vcsa->vcsa_spa = spa;
+	vcsa->vcsa_txg = txg;
+
 	/*
 	 * See comment before spa_vdev_copy_one_child().
 	 */
 	spa_config_enter(spa, SCL_STATE, spa, RW_READER);
 	zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
-	    spa_vdev_copy_nullzio_done, NULL, 0);
+	    spa_vdev_copy_segment_done, vcsa, 0);
 	vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
 	if (dest_vd->vdev_ops == &vdev_mirror_ops) {
 		for (int i = 0; i < dest_vd->vdev_children; i++) {
@@ -1092,39 +1215,78 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr
 
 	mutex_enter(&svr->svr_lock);
 
-	range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
-	if (rs == NULL) {
+	/*
+	 * Determine how big of a chunk to copy.  We can allocate up
+	 * to max_alloc bytes, and we can span up to vdev_removal_max_span
+	 * bytes of unallocated space at a time.  "segs" will track the
+	 * allocated segments that we are copying.  We may also be copying
+	 * free segments (of up to vdev_removal_max_span bytes).
+	 */
+	range_tree_t *segs = range_tree_create(NULL, NULL);
+	for (;;) {
+		range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
+		if (rs == NULL)
+			break;
+
+		uint64_t seg_length;
+
+		if (range_tree_is_empty(segs)) {
+			/* need to truncate the first seg based on max_alloc */
+			seg_length =
+			    MIN(rs->rs_end - rs->rs_start, *max_alloc);
+		} else {
+			if (rs->rs_start - range_tree_max(segs) >
+			    vdev_removal_max_span) {
+				/*
+				 * Including this segment would cause us to
+				 * copy a larger unneeded chunk than is allowed.
+				 */
+				break;
+			} else if (rs->rs_end - range_tree_min(segs) >
+			    *max_alloc) {
+				/*
+				 * This additional segment would extend past
+				 * max_alloc. Rather than splitting this
+				 * segment, leave it for the next mapping.
+				 */
+				break;
+			} else {
+				seg_length = rs->rs_end - rs->rs_start;
+			}
+		}
+
+		range_tree_add(segs, rs->rs_start, seg_length);
+		range_tree_remove(svr->svr_allocd_segs,
+		    rs->rs_start, seg_length);
+	}
+
+	if (range_tree_is_empty(segs)) {
 		mutex_exit(&svr->svr_lock);
+		range_tree_destroy(segs);
 		return;
 	}
-	uint64_t offset = rs->rs_start;
-	uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc);
 
-	range_tree_remove(svr->svr_allocd_segs, offset, length);
-
 	if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
 		dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
 		    svr, 0, ZFS_SPACE_CHECK_NONE, tx);
 	}
 
-	svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length;
+	svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
 
 	/*
 	 * Note: this is the amount of *allocated* space
 	 * that we are taking care of each txg.
 	 */
-	svr->svr_bytes_done[txg & TXG_MASK] += length;
+	svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs);
 
 	mutex_exit(&svr->svr_lock);
 
 	zio_alloc_list_t zal;
 	metaslab_trace_init(&zal);
-	uint64_t thismax = *max_alloc;
-	while (length > 0) {
-		uint64_t mylen = MIN(length, thismax);
-
+	uint64_t thismax = SPA_MAXBLOCKSIZE;
+	while (!range_tree_is_empty(segs)) {
 		int error = spa_vdev_copy_segment(vd,
-		    offset, mylen, txg, vca, &zal);
+		    segs, thismax, txg, vca, &zal);
 
 		if (error == ENOSPC) {
 			/*
@@ -1138,18 +1300,17 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr
 			 */
 			ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
 			ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
-			thismax = P2ROUNDUP(mylen / 2,
+			uint64_t attempted =
+			    MIN(range_tree_span(segs), thismax);
+			thismax = P2ROUNDUP(attempted / 2,
 			    1 << spa->spa_max_ashift);
-			ASSERT3U(thismax, <, mylen);
 			/*
 			 * The minimum-size allocation can not fail.
 			 */
-			ASSERT3U(mylen, >, 1 << spa->spa_max_ashift);
-			*max_alloc = mylen - (1 << spa->spa_max_ashift);
+			ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
+			*max_alloc = attempted - (1 << spa->spa_max_ashift);
 		} else {
 			ASSERT0(error);
-			length -= mylen;
-			offset += mylen;
 
 			/*
 			 * We've performed an allocation, so reset the
@@ -1160,6 +1321,7 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr
 		}
 	}
 	metaslab_trace_fini(&zal);
+	range_tree_destroy(segs);
 }
 
 /*
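
Taken together, spa_vdev_copy_impl() now gathers a batch of allocated
segments before each copy: starting from the lowest allocated segment,
it keeps appending segments while the free gap to the next one is at
most vdev_removal_max_span and the batch's overall span stays within
max_alloc; spa_vdev_copy_segment() then allocates one destination run
for the whole batch and frees the skipped gaps as obsolete.  A hedged
standalone sketch of that gathering policy (plain arrays instead of
range trees; gather_batch is a hypothetical name, not ZFS code):

	#include <stdint.h>
	#include <stdio.h>

	struct seg { uint64_t start, end; };	/* [start, end), sorted */

	static int
	gather_batch(const struct seg *allocd, int n,
	    uint64_t max_span, uint64_t max_alloc)
	{
		if (n == 0)
			return (0);

		/* The caller truncates the first segment to max_alloc. */
		uint64_t batch_min = allocd[0].start;
		uint64_t batch_max = allocd[0].end;
		int count = 1;

		for (int i = 1; i < n; i++) {
			if (allocd[i].start - batch_max > max_span)
				break;	/* gap too large to copy through */
			if (allocd[i].end - batch_min > max_alloc)
				break;	/* would exceed one allocation */
			batch_max = allocd[i].end;
			count++;
		}
		return (count);	/* segments copied as one mapping */
	}

	int
	main(void)
	{
		/* 8KB segments with 16KB gaps, then a 1MB gap. */
		struct seg segs[] = {
			{ 0, 8192 }, { 24576, 32768 }, { 49152, 57344 },
			{ 1105920, 1114112 },
		};

		/* The first three merge; the 1MB gap ends the batch. */
		printf("%d segments in first batch\n",
		    gather_batch(segs, 4, 32 * 1024, 1024 * 1024));
		return (0);
	}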


