Date:      Mon, 1 Aug 2011 08:17:54 +0000 (UTC)
From:      Martin Matuska <mm@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-8@freebsd.org
Subject:   svn commit: r224565 - in stable/8: cddl/contrib/opensolaris/cmd/ztest sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys
Message-ID:  <201108010817.p718Hs5I084439@svn.freebsd.org>

Author: mm
Date: Mon Aug  1 08:17:54 2011
New Revision: 224565
URL: http://svn.freebsd.org/changeset/base/224565

Log:
  MFC 224177:
  ZFS tries to allocate blocks evenly across all devices. This means that
  when devices are imbalanced, ZFS will spend lots of CPU time searching
  for space on devices which tend to be pretty full. It should instead fail
  quickly on the full devices and move on to devices which have more free
  space available.
  
  New loader tunable: vfs.zfs.mg_alloc_failures (min = 8)
  
  Illumos-gate changeset:	13379:4df42cc92254
  
  Obtained from:	Illumos (Bug #1051)
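
  The new knob is a boot-time tunable: it is declared with CTLFLAG_RDTUN and
  read via TUNABLE_INT, and zio_init() enforces a floor of 8, defaulting to
  MAX(3 * max_ncpus / 2, 8) when left at 0. A minimal usage sketch, assuming
  a system running this change (the value 16 is only an example):

	# /boot/loader.conf: raise the per-vdev allocation failure threshold
	vfs.zfs.mg_alloc_failures="16"

	# After reboot, verify the effective value:
	sysctl vfs.zfs.mg_alloc_failures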

Modified:
  stable/8/cddl/contrib/opensolaris/cmd/ztest/ztest.c
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
  stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
Directory Properties:
  stable/8/cddl/contrib/opensolaris/   (props changed)
  stable/8/sys/   (props changed)
  stable/8/sys/amd64/include/xen/   (props changed)
  stable/8/sys/cddl/contrib/opensolaris/   (props changed)
  stable/8/sys/contrib/dev/acpica/   (props changed)
  stable/8/sys/contrib/pf/   (props changed)

Modified: stable/8/cddl/contrib/opensolaris/cmd/ztest/ztest.c
==============================================================================
--- stable/8/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Mon Aug  1 08:14:16 2011	(r224564)
+++ stable/8/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Mon Aug  1 08:17:54 2011	(r224565)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 /*
@@ -5137,6 +5138,7 @@ ztest_run(ztest_shared_t *zs)
 	 */
 	kernel_init(FREAD | FWRITE);
 	VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0);
+	spa->spa_debug = B_TRUE;
 	zs->zs_spa = spa;
 
 	spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN;

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	Mon Aug  1 08:14:16 2011	(r224564)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	Mon Aug  1 08:17:54 2011	(r224565)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -30,10 +31,35 @@
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 
+/*
+ * Allow allocations to switch to gang blocks quickly. We do this to
+ * avoid having to load lots of space_maps in a given txg. There are,
+ * however, some cases where we want to avoid "fast" ganging and instead
+ * we want to do an exhaustive search of all metaslabs on this device.
+ * Currently we don't allow any gang or dump device related allocations
+ * to "fast" gang.
+ */
+#define	CAN_FASTGANG(flags) \
+	(!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
+	METASLAB_GANG_AVOID)))
+
 uint64_t metaslab_aliquot = 512ULL << 10;
 uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1;	/* force gang blocks */
 
 /*
+ * This value defines the number of allowed allocation failures per vdev.
+ * If a device reaches this threshold in a given txg then we consider skipping
+ * allocations on that device.
+ */
+int zfs_mg_alloc_failures = 0;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_alloc_failures, CTLFLAG_RDTUN,
+    &zfs_mg_alloc_failures, 0,
+    "Number of allowed allocation failures per vdev");
+TUNABLE_INT("vfs.zfs.mg_alloc_failures", &zfs_mg_alloc_failures);
+
+/*
  * Metaslab debugging: when set, keeps all space maps in core to verify frees.
  */
 static int metaslab_debug = 0;
@@ -671,7 +697,7 @@ static space_map_ops_t metaslab_ndf_ops 
 	metaslab_ndf_fragmented
 };
 
-space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
+space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 
 /*
  * ==========================================================================
@@ -844,7 +870,7 @@ metaslab_prefetch(metaslab_group_t *mg)
 }
 
 static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
 {
 	metaslab_group_t *mg = msp->ms_group;
 	space_map_t *sm = &msp->ms_map;
@@ -877,13 +903,6 @@ metaslab_activate(metaslab_t *msp, uint6
 			mutex_exit(&mg->mg_lock);
 		}
 
-		/*
-		 * If we were able to load the map then make sure
-		 * that this map is still able to satisfy our request.
-		 */
-		if (msp->ms_weight < size)
-			return (ENOSPC);
-
 		metaslab_group_sort(msp->ms_group, msp,
 		    msp->ms_weight | activation_weight);
 	}
@@ -1099,6 +1118,7 @@ void
 metaslab_sync_reassess(metaslab_group_t *mg)
 {
 	vdev_t *vd = mg->mg_vd;
+	int64_t failures = mg->mg_alloc_failures;
 
 	/*
 	 * Re-evaluate all metaslabs which have lower offsets than the
@@ -1115,6 +1135,8 @@ metaslab_sync_reassess(metaslab_group_t 
 		mutex_exit(&msp->ms_lock);
 	}
 
+	atomic_add_64(&mg->mg_alloc_failures, -failures);
+
 	/*
 	 * Prefetch the next potential metaslabs
 	 */
@@ -1139,9 +1161,10 @@ metaslab_distance(metaslab_t *msp, dva_t
 }
 
 static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
-    uint64_t min_distance, dva_t *dva, int d)
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
+    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
 {
+	spa_t *spa = mg->mg_vd->vdev_spa;
 	metaslab_t *msp = NULL;
 	uint64_t offset = -1ULL;
 	avl_tree_t *t = &mg->mg_metaslab_tree;
@@ -1162,11 +1185,17 @@ metaslab_group_alloc(metaslab_group_t *m
 
 		mutex_enter(&mg->mg_lock);
 		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
-			if (msp->ms_weight < size) {
+			if (msp->ms_weight < asize) {
+				spa_dbgmsg(spa, "%s: failed to meet weight "
+				    "requirement: vdev %llu, txg %llu, mg %p, "
+				    "msp %p, psize %llu, asize %llu, "
+				    "failures %llu, weight %llu",
+				    spa_name(spa), mg->mg_vd->vdev_id, txg,
+				    mg, msp, psize, asize,
+				    mg->mg_alloc_failures, msp->ms_weight);
 				mutex_exit(&mg->mg_lock);
 				return (-1ULL);
 			}
-
 			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
 			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
 				break;
@@ -1185,6 +1214,25 @@ metaslab_group_alloc(metaslab_group_t *m
 		if (msp == NULL)
 			return (-1ULL);
 
+		/*
+		 * If we've already reached the allowable number of failed
+		 * allocation attempts on this metaslab group then we
+		 * consider skipping it. We skip it only if we're allowed
+		 * to "fast" gang, the physical size is larger than
+		 * a gang block, and we're attempting to allocate from
+		 * the primary metaslab.
+		 */
+		if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
+		    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
+		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
+			spa_dbgmsg(spa, "%s: skipping metaslab group: "
+			    "vdev %llu, txg %llu, mg %p, psize %llu, "
+			    "asize %llu, failures %llu", spa_name(spa),
+			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
+			    mg->mg_alloc_failures);
+			return (-1ULL);
+		}
+
 		mutex_enter(&msp->ms_lock);
 
 		/*
@@ -1193,7 +1241,7 @@ metaslab_group_alloc(metaslab_group_t *m
 		 * another thread may have changed the weight while we
 		 * were blocked on the metaslab lock.
 		 */
-		if (msp->ms_weight < size || (was_active &&
+		if (msp->ms_weight < asize || (was_active &&
 		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
 		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
 			mutex_exit(&msp->ms_lock);
@@ -1208,14 +1256,16 @@ metaslab_group_alloc(metaslab_group_t *m
 			continue;
 		}
 
-		if (metaslab_activate(msp, activation_weight, size) != 0) {
+		if (metaslab_activate(msp, activation_weight) != 0) {
 			mutex_exit(&msp->ms_lock);
 			continue;
 		}
 
-		if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
+		if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
 			break;
 
+		atomic_inc_64(&mg->mg_alloc_failures);
+
 		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
 
 		mutex_exit(&msp->ms_lock);
@@ -1224,7 +1274,7 @@ metaslab_group_alloc(metaslab_group_t *m
 	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
 		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
-	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
 
 	mutex_exit(&msp->ms_lock);
 
@@ -1351,7 +1401,8 @@ top:
 		asize = vdev_psize_to_asize(vd, psize);
 		ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
 
-		offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+		offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
+		    dva, d, flags);
 		if (offset != -1ULL) {
 			/*
 			 * If we've just selected this metaslab group,
@@ -1363,18 +1414,24 @@ top:
 				vdev_stat_t *vs = &vd->vdev_stat;
 				int64_t vu, cu;
 
-				/*
-				 * Determine percent used in units of 0..1024.
-				 * (This is just to avoid floating point.)
-				 */
-				vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
-				cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
+				vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
+				cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
 
 				/*
-				 * Bias by at most +/- 25% of the aliquot.
+				 * Calculate how much more or less we should
+				 * try to allocate from this device during
+				 * this iteration around the rotor.
+				 * For example, if a device is 80% full
+				 * and the pool is 20% full then we should
+				 * reduce allocations by 60% on this device.
+				 *
+				 * mg_bias = (20 - 80) * 512K / 100 = -307K
+				 *
+				 * This reduces allocations by 307K for this
+				 * iteration.
 				 */
 				mg->mg_bias = ((cu - vu) *
-				    (int64_t)mg->mg_aliquot) / (1024 * 4);
+				    (int64_t)mg->mg_aliquot) / 100;
 			}
 
 			if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
@@ -1488,7 +1545,7 @@ metaslab_claim_dva(spa_t *spa, const dva
 	mutex_enter(&msp->ms_lock);
 
 	if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
-		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+		error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
 
 	if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
 		error = ENOENT;

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	Mon Aug  1 08:14:16 2011	(r224564)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	Mon Aug  1 08:17:54 2011	(r224565)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -1674,3 +1675,9 @@ spa_scan_get_stats(spa_t *spa, pool_scan
 
 	return (0);
 }
+
+boolean_t
+spa_debug_enabled(spa_t *spa)
+{
+	return (spa->spa_debug);
+}

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h	Mon Aug  1 08:14:16 2011	(r224564)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h	Mon Aug  1 08:17:54 2011	(r224565)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_H
@@ -47,6 +48,8 @@ extern void metaslab_sync_reassess(metas
 #define	METASLAB_HINTBP_FAVOR	0x0
 #define	METASLAB_HINTBP_AVOID	0x1
 #define	METASLAB_GANG_HEADER	0x2
+#define	METASLAB_GANG_CHILD	0x4
+#define	METASLAB_GANG_AVOID	0x8
 
 extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
     blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h	Mon Aug  1 08:14:16 2011	(r224564)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h	Mon Aug  1 08:17:54 2011	(r224565)
@@ -21,6 +21,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_IMPL_H
@@ -52,6 +53,7 @@ struct metaslab_group {
 	avl_tree_t		mg_metaslab_tree;
 	uint64_t		mg_aliquot;
 	uint64_t		mg_bonus_area;
+	uint64_t		mg_alloc_failures;
 	int64_t			mg_bias;
 	int64_t			mg_activation_count;
 	metaslab_class_t	*mg_class;

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h	Mon Aug  1 08:14:16 2011	(r224564)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h	Mon Aug  1 08:17:54 2011	(r224565)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_SPA_H
@@ -698,6 +699,13 @@ _NOTE(CONSTCOND) } while (0)
 #define	dprintf_bp(bp, fmt, ...)
 #endif
 
+extern boolean_t spa_debug_enabled(spa_t *spa);
+#define	spa_dbgmsg(spa, ...)			\
+{						\
+	if (spa_debug_enabled(spa))		\
+		zfs_dbgmsg(__VA_ARGS__);	\
+}
+
 extern int spa_mode_global;			/* mode, e.g. FREAD | FWRITE */
 
 #ifdef	__cplusplus

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h	Mon Aug  1 08:14:16 2011	(r224564)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h	Mon Aug  1 08:17:54 2011	(r224565)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_SPA_IMPL_H
@@ -196,6 +197,7 @@ struct spa {
 	kcondvar_t	spa_suspend_cv;		/* notification of resume */
 	uint8_t		spa_suspended;		/* pool is suspended */
 	uint8_t		spa_claiming;		/* pool is doing zil_claim() */
+	boolean_t	spa_debug;		/* debug enabled? */
 	boolean_t	spa_is_root;		/* pool is root */
 	int		spa_minref;		/* num refs when first opened */
 	int		spa_mode;		/* FREAD | FWRITE */

Modified: stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
==============================================================================
--- stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Mon Aug  1 08:14:16 2011	(r224564)
+++ stable/8/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Mon Aug  1 08:17:54 2011	(r224565)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -85,6 +86,7 @@ kmem_cache_t *zio_data_buf_cache[SPA_MAX
 #ifdef _KERNEL
 extern vmem_t *zio_alloc_arena;
 #endif
+extern int zfs_mg_alloc_failures;
 
 /*
  * An allocating zio is one that either currently has the DVA allocate
@@ -160,6 +162,15 @@ zio_init(void)
 			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 	}
 
+	/*
+	 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
+	 * to fail 3 times per txg or 8 failures, whichever is greater.
+	 */
+	if (zfs_mg_alloc_failures == 0)
+		zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
+	else if (zfs_mg_alloc_failures < 8)
+		zfs_mg_alloc_failures = 8;
+
 	zio_inject_init();
 }
 
@@ -2135,6 +2146,7 @@ zio_dva_allocate(zio_t *zio)
 	metaslab_class_t *mc = spa_normal_class(spa);
 	blkptr_t *bp = zio->io_bp;
 	int error;
+	int flags = 0;
 
 	if (zio->io_gang_leader == NULL) {
 		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
@@ -2147,10 +2159,21 @@ zio_dva_allocate(zio_t *zio)
 	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
 	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
 
+	/*
+	 * The dump device does not support gang blocks so allocation on
+	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
+	 * the "fast" gang feature.
+	 */
+	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
+	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
+	    METASLAB_GANG_CHILD : 0;
 	error = metaslab_alloc(spa, mc, zio->io_size, bp,
-	    zio->io_prop.zp_copies, zio->io_txg, NULL, 0);
+	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
 
 	if (error) {
+		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
+		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
+		    error);
 		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
 			return (zio_write_gang_block(zio));
 		zio->io_error = error;


