Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 5 Sep 2013 17:55:15 +0000 (UTC)
From:      Xin LI <delphij@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r255256 - in vendor-sys/illumos/dist/uts/common/fs/zfs: . sys
Message-ID:  <201309051755.r85HtFud093886@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: delphij
Date: Thu Sep  5 17:55:14 2013
New Revision: 255256
URL: http://svnweb.freebsd.org/changeset/base/255256

Log:
  Update vendor-sys/illumos/dist to 14171:98413c8cf54d:
  
  Illumos ZFS issues:
    3954 metaslabs continue to load even after hitting zfs_mg_alloc_failure limit
    4080 zpool clear fails to clear pool
    4081 need zfs_mg_noalloc_threshold

Modified:
  vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c	Thu Sep  5 17:52:54 2013	(r255255)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c	Thu Sep  5 17:55:14 2013	(r255256)
@@ -58,9 +58,25 @@ int zfs_condense_pct = 200;
 /*
  * This value defines the number of allowed allocation failures per vdev.
  * If a device reaches this threshold in a given txg then we consider skipping
- * allocations on that device.
+ * allocations on that device. The value of zfs_mg_alloc_failures is computed
+ * in zio_init() unless it has been overridden in /etc/system.
  */
-int zfs_mg_alloc_failures;
+int zfs_mg_alloc_failures = 0;
+
+/*
+ * The zfs_mg_noalloc_threshold defines which metaslab groups should
+ * be eligible for allocation. The value is defined as a percentage of
+ * a free space. Metaslab groups that have more free space than
+ * zfs_mg_noalloc_threshold are always eligible for allocations. Once
+ * a metaslab group's free space is less than or equal to the
+ * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
+ * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
+ * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
+ * groups are allowed to accept allocations. Gang blocks are always
+ * eligible to allocate on any metaslab group. The default value of 0 means
+ * no metaslab group will be excluded based on this criterion.
+ */
+int zfs_mg_noalloc_threshold = 0;
 
 /*
  * Metaslab debugging: when set, keeps all space maps in core to verify frees.
@@ -224,6 +240,53 @@ metaslab_compare(const void *x1, const v
 	return (0);
 }
 
+/*
+ * Update the allocatable flag and the metaslab group's capacity.
+ * The allocatable flag is set to true if the capacity is below
+ * the zfs_mg_noalloc_threshold. If a metaslab group transitions
+ * from allocatable to non-allocatable or vice versa then the metaslab
+ * group's class is updated to reflect the transition.
+ */
+static void
+metaslab_group_alloc_update(metaslab_group_t *mg)
+{
+	vdev_t *vd = mg->mg_vd;
+	metaslab_class_t *mc = mg->mg_class;
+	vdev_stat_t *vs = &vd->vdev_stat;
+	boolean_t was_allocatable;
+
+	ASSERT(vd == vd->vdev_top);
+
+	mutex_enter(&mg->mg_lock);
+	was_allocatable = mg->mg_allocatable;
+
+	mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
+	    (vs->vs_space + 1);
+
+	mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold);
+
+	/*
+	 * The mc_alloc_groups maintains a count of the number of
+	 * groups in this metaslab class that are still above the
+	 * zfs_mg_noalloc_threshold. This is used by the allocating
+	 * threads to determine if they should avoid allocations to
+	 * a given group. The allocator will avoid allocations to a group
+	 * if that group has reached or is below the zfs_mg_noalloc_threshold
+	 * and there are still other groups that are above the threshold.
+	 * When a group transitions from allocatable to non-allocatable or
+	 * vice versa we update the metaslab class to reflect that change.
+	 * When the mc_alloc_groups value drops to 0 that means that all
+	 * groups have reached the zfs_mg_noalloc_threshold making all groups
+	 * eligible for allocations. This effectively means that all devices
+	 * are balanced again.
+	 */
+	if (was_allocatable && !mg->mg_allocatable)
+		mc->mc_alloc_groups--;
+	else if (!was_allocatable && mg->mg_allocatable)
+		mc->mc_alloc_groups++;
+	mutex_exit(&mg->mg_lock);
+}
+
 metaslab_group_t *
 metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
 {
@@ -274,6 +337,7 @@ metaslab_group_activate(metaslab_group_t
 		return;
 
 	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+	metaslab_group_alloc_update(mg);
 
 	if ((mgprev = mc->mc_rotor) == NULL) {
 		mg->mg_prev = mg;
@@ -359,6 +423,29 @@ metaslab_group_sort(metaslab_group_t *mg
 }
 
 /*
+ * Determine if a given metaslab group should skip allocations. A metaslab
+ * group should avoid allocations if its used capacity has crossed the
+ * zfs_mg_noalloc_threshold and there is at least one metaslab group
+ * that can still handle allocations.
+ */
+static boolean_t
+metaslab_group_allocatable(metaslab_group_t *mg)
+{
+	vdev_t *vd = mg->mg_vd;
+	spa_t *spa = vd->vdev_spa;
+	metaslab_class_t *mc = mg->mg_class;
+
+	/*
+	 * A metaslab group is considered allocatable if its free capacity
+	 * is greater than the set value of zfs_mg_noalloc_threshold, it's
+	 * associated with a slog, or there are no other metaslab groups
+	 * with free capacity greater than zfs_mg_noalloc_threshold.
+	 */
+	return (mg->mg_free_capacity > zfs_mg_noalloc_threshold ||
+	    mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0);
+}
+
+/*
  * ==========================================================================
  * Common allocator routines
  * ==========================================================================
@@ -1307,6 +1394,8 @@ metaslab_sync_reassess(metaslab_group_t 
 	vdev_t *vd = mg->mg_vd;
 	int64_t failures = mg->mg_alloc_failures;
 
+	metaslab_group_alloc_update(mg);
+
 	/*
 	 * Re-evaluate all metaslabs which have lower offsets than the
 	 * bonus area.
@@ -1408,6 +1497,8 @@ metaslab_group_alloc(metaslab_group_t *m
 		if (msp == NULL)
 			return (-1ULL);
 
+		mutex_enter(&msp->ms_lock);
+
 		/*
 		 * If we've already reached the allowable number of failed
 		 * allocation attempts on this metaslab group then we
@@ -1424,11 +1515,10 @@ metaslab_group_alloc(metaslab_group_t *m
 			    "asize %llu, failures %llu", spa_name(spa),
 			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
 			    mg->mg_alloc_failures);
+			mutex_exit(&msp->ms_lock);
 			return (-1ULL);
 		}
 
-		mutex_enter(&msp->ms_lock);
-
 		/*
 		 * Ensure that the metaslab we have selected is still
 		 * capable of handling our request. It's possible that
@@ -1581,6 +1671,21 @@ top:
 		} else {
 			allocatable = vdev_allocatable(vd);
 		}
+
+		/*
+		 * Determine if the selected metaslab group is eligible
+		 * for allocations. If we're ganging or have requested
+		 * an allocation for the smallest gang block size
+		 * then we don't want to avoid allocating to the this
+		 * metaslab group. If we're in this condition we should
+		 * try to allocate from any device possible so that we
+		 * don't inadvertently return ENOSPC and suspend the pool
+		 * even though space is still available.
+		 */
+		if (allocatable && CAN_FASTGANG(flags) &&
+		    psize > SPA_GANGBLOCKSIZE)
+			allocatable = metaslab_group_allocatable(mg);
+
 		if (!allocatable)
 			goto next;
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h	Thu Sep  5 17:52:54 2013	(r255255)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h	Thu Sep  5 17:55:14 2013	(r255256)
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_IMPL_H
@@ -45,6 +45,7 @@ struct metaslab_class {
 	metaslab_group_t	*mc_rotor;
 	space_map_ops_t		*mc_ops;
 	uint64_t		mc_aliquot;
+	uint64_t		mc_alloc_groups; /* # of allocatable groups */
 	uint64_t		mc_alloc;	/* total allocated space */
 	uint64_t		mc_deferred;	/* total deferred frees */
 	uint64_t		mc_space;	/* total space (alloc + free) */
@@ -57,6 +58,8 @@ struct metaslab_group {
 	uint64_t		mg_aliquot;
 	uint64_t		mg_bonus_area;
 	uint64_t		mg_alloc_failures;
+	boolean_t		mg_allocatable;		/* can we allocate? */
+	uint64_t		mg_free_capacity;	/* percentage free */
 	int64_t			mg_bias;
 	int64_t			mg_activation_count;
 	metaslab_class_t	*mg_class;

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c	Thu Sep  5 17:52:54 2013	(r255255)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_ioctl.c	Thu Sep  5 17:55:14 2013	(r255256)
@@ -5405,7 +5405,7 @@ zfs_ioctl_init(void)
 	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
 
 	zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
-	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);
+	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
 	zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
 	    zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);
 

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c	Thu Sep  5 17:52:54 2013	(r255255)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c	Thu Sep  5 17:55:14 2013	(r255256)
@@ -171,7 +171,8 @@ zio_init(void)
 	 * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
 	 * to fail 3 times per txg or 8 failures, whichever is greater.
 	 */
-	zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
+	if (zfs_mg_alloc_failures == 0)
+		zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
 
 	zio_inject_init();
 }
@@ -2365,7 +2366,7 @@ zio_alloc_zil(spa_t *spa, uint64_t txg, 
 	if (error) {
 		error = metaslab_alloc(spa, spa_normal_class(spa), size,
 		    new_bp, 1, txg, old_bp,
-		    METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
+		    METASLAB_HINTBP_AVOID);
 	}
 
 	if (error == 0) {



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201309051755.r85HtFud093886>