Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 25 Nov 2012 09:06:32 +0000 (UTC)
From:      Martin Matuska <mm@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r243503 - in head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys
Message-ID:  <201211250906.qAP96W4D030291@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mm
Date: Sun Nov 25 09:06:32 2012
New Revision: 243503
URL: http://svnweb.freebsd.org/changeset/base/243503

Log:
  MFV r242735:
  
  Illumos 13879:4eac7a87eff2:
  3329 spa_sync() spends 10-20% of its time in spa_free_sync_cb()
  3330 space_seg_t should have its own kmem_cache
  3331 deferred frees should happen after sync_pass 1
  3335 make SYNC_PASS_* constants tunable
  
  New loader-only tunables:
  vfs.zfs.sync_pass_deferred_free
  vfs.zfs.sync_pass_dont_compress
  vfs.zfs.sync_pass_rewrite
  
  References:
  https://www.illumos.org/issues/3329
  https://www.illumos.org/issues/3330
  https://www.illumos.org/issues/3331
  https://www.illumos.org/issues/3335
  
  MFC after:	2 weeks

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
Directory Properties:
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	Sat Nov 24 13:23:15 2012	(r243502)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c	Sun Nov 25 09:06:32 2012	(r243503)
@@ -881,8 +881,9 @@ metaslab_activate(metaslab_t *msp, uint6
 	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
 		space_map_load_wait(sm);
 		if (!sm->sm_loaded) {
-			int error = space_map_load(sm, sm_ops, SM_FREE,
-			    &msp->ms_smo,
+			space_map_obj_t *smo = &msp->ms_smo;
+
+			int error = space_map_load(sm, sm_ops, SM_FREE, smo,
 			    spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
 			if (error)  {
 				metaslab_group_sort(msp->ms_group, msp, 0);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	Sat Nov 24 13:23:15 2012	(r243502)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	Sun Nov 25 09:06:32 2012	(r243503)
@@ -139,6 +139,7 @@ boolean_t	zio_taskq_sysdc = B_TRUE;	/* u
 uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
 
 boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
+extern int	zfs_sync_pass_deferred_free;
 
 /*
  * This (illegal) pool name is used when temporarily importing a spa_t in order
@@ -6302,7 +6303,7 @@ spa_sync(spa_t *spa, uint64_t txg)
 		spa_errlog_sync(spa, txg);
 		dsl_pool_sync(dp, txg);
 
-		if (pass <= SYNC_PASS_DEFERRED_FREE) {
+		if (pass < zfs_sync_pass_deferred_free) {
 			zio_t *zio = zio_root(spa, NULL, NULL, 0);
 			bplist_iterate(free_bpl, spa_free_sync_cb,
 			    zio, tx);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	Sat Nov 24 13:23:15 2012	(r243502)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c	Sun Nov 25 09:06:32 2012	(r243503)
@@ -1619,6 +1619,7 @@ spa_init(int mode)
 #endif /* illumos */
 	refcount_sysinit();
 	unique_init();
+	space_map_init();
 	zio_init();
 	dmu_init();
 	zil_init();
@@ -1641,6 +1642,7 @@ spa_fini(void)
 	zil_fini();
 	dmu_fini();
 	zio_fini();
+	space_map_fini();
 	unique_fini();
 	refcount_fini();
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c	Sat Nov 24 13:23:15 2012	(r243502)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c	Sun Nov 25 09:06:32 2012	(r243503)
@@ -39,6 +39,23 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map
     &space_map_last_hope, 0,
     "If kernel panic in space_map code on pool import, import the pool in readonly mode and backup all your data before trying this option.");
 
+static kmem_cache_t *space_seg_cache;
+
+void
+space_map_init(void)
+{
+	ASSERT(space_seg_cache == NULL);
+	space_seg_cache = kmem_cache_create("space_seg_cache",
+	    sizeof (space_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+void
+space_map_fini(void)
+{
+	kmem_cache_destroy(space_seg_cache);
+	space_seg_cache = NULL;
+}
+
 /*
  * Space map routines.
  * NOTE: caller is responsible for all locking.
@@ -148,7 +165,7 @@ again:
 			avl_remove(sm->sm_pp_root, ss_after);
 		}
 		ss_after->ss_start = ss_before->ss_start;
-		kmem_free(ss_before, sizeof (*ss_before));
+		kmem_cache_free(space_seg_cache, ss_before);
 		ss = ss_after;
 	} else if (merge_before) {
 		ss_before->ss_end = end;
@@ -161,7 +178,7 @@ again:
 			avl_remove(sm->sm_pp_root, ss_after);
 		ss = ss_after;
 	} else {
-		ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
+		ss = kmem_cache_alloc(space_seg_cache, KM_SLEEP);
 		ss->ss_start = start;
 		ss->ss_end = end;
 		avl_insert(&sm->sm_root, ss, where);
@@ -207,7 +224,7 @@ space_map_remove(space_map_t *sm, uint64
 		avl_remove(sm->sm_pp_root, ss);
 
 	if (left_over && right_over) {
-		newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
+		newseg = kmem_cache_alloc(space_seg_cache, KM_SLEEP);
 		newseg->ss_start = end;
 		newseg->ss_end = ss->ss_end;
 		ss->ss_end = start;
@@ -220,7 +237,7 @@ space_map_remove(space_map_t *sm, uint64
 		ss->ss_start = end;
 	} else {
 		avl_remove(&sm->sm_root, ss);
-		kmem_free(ss, sizeof (*ss));
+		kmem_cache_free(space_seg_cache, ss);
 		ss = NULL;
 	}
 
@@ -260,7 +277,7 @@ space_map_vacate(space_map_t *sm, space_
 	while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
 		if (func != NULL)
 			func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
-		kmem_free(ss, sizeof (*ss));
+		kmem_cache_free(space_seg_cache, ss);
 	}
 	sm->sm_space = 0;
 }
@@ -432,7 +449,7 @@ space_map_sync(space_map_t *sm, uint8_t 
 	spa_t *spa = dmu_objset_spa(os);
 	void *cookie = NULL;
 	space_seg_t *ss;
-	uint64_t bufsize, start, size, run_len;
+	uint64_t bufsize, start, size, run_len, delta, sm_space;
 	uint64_t *entry, *entry_map, *entry_map_end;
 
 	ASSERT(MUTEX_HELD(sm->sm_lock));
@@ -461,11 +478,13 @@ space_map_sync(space_map_t *sm, uint8_t 
 	    SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
 	    SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
 
+	delta = 0;
+	sm_space = sm->sm_space;
 	while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
 		size = ss->ss_end - ss->ss_start;
 		start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
 
-		sm->sm_space -= size;
+		delta += size;
 		size >>= sm->sm_shift;
 
 		while (size) {
@@ -487,7 +506,7 @@ space_map_sync(space_map_t *sm, uint8_t 
 			start += run_len;
 			size -= run_len;
 		}
-		kmem_free(ss, sizeof (*ss));
+		kmem_cache_free(space_seg_cache, ss);
 	}
 
 	if (entry != entry_map) {
@@ -499,8 +518,15 @@ space_map_sync(space_map_t *sm, uint8_t 
 		smo->smo_objsize += size;
 	}
 
+	/*
+	 * Ensure that the space_map's accounting wasn't changed
+	 * while we were in the middle of writing it out.
+	 */
+	VERIFY3U(sm->sm_space, ==, sm_space);
+
 	zio_buf_free(entry_map, bufsize);
 
+	sm->sm_space -= delta;
 	VERIFY0(sm->sm_space);
 }
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h	Sat Nov 24 13:23:15 2012	(r243502)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h	Sun Nov 25 09:06:32 2012	(r243503)
@@ -21,7 +21,10 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
- * Copyright (c) 2011 by Delphix. All rights reserved.
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef _SYS_METASLAB_IMPL_H

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h	Sat Nov 24 13:23:15 2012	(r243502)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h	Sun Nov 25 09:06:32 2012	(r243503)
@@ -489,14 +489,6 @@ extern int spa_scan_stop(spa_t *spa);
 extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
 extern void spa_sync_allpools(void);
 
-/*
- * DEFERRED_FREE must be large enough that regular blocks are not
- * deferred.  XXX so can't we change it back to 1?
- */
-#define	SYNC_PASS_DEFERRED_FREE	2	/* defer frees after this pass */
-#define	SYNC_PASS_DONT_COMPRESS	4	/* don't compress after this pass */
-#define	SYNC_PASS_REWRITE	1	/* rewrite new bps after this pass */
-
 /* spa namespace global mutex */
 extern kmutex_t spa_namespace_lock;
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h	Sat Nov 24 13:23:15 2012	(r243502)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h	Sun Nov 25 09:06:32 2012	(r243503)
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #ifndef _SYS_SPACE_MAP_H
 #define	_SYS_SPACE_MAP_H
 
@@ -136,6 +140,8 @@ struct space_map_ops {
 
 typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size);
 
+extern void space_map_init(void);
+extern void space_map_fini(void);
 extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
     uint8_t shift, kmutex_t *lp);
 extern void space_map_destroy(space_map_t *sm);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h	Sat Nov 24 13:23:15 2012	(r243502)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h	Sun Nov 25 09:06:32 2012	(r243503)
@@ -23,6 +23,10 @@
  * Use is subject to license terms.
  */
 
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */
+
 #ifndef _ZIO_IMPL_H
 #define	_ZIO_IMPL_H
 
@@ -143,6 +147,7 @@ enum zio_stage {
 #define	ZIO_FREE_PIPELINE			\
 	(ZIO_INTERLOCK_STAGES |			\
 	ZIO_STAGE_FREE_BP_INIT |		\
+	ZIO_STAGE_ISSUE_ASYNC |			\
 	ZIO_STAGE_DVA_FREE |			\
 	ZIO_STAGE_VDEV_IO_START |		\
 	ZIO_STAGE_VDEV_IO_ASSESS)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Sat Nov 24 13:23:15 2012	(r243502)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Sun Nov 25 09:06:32 2012	(r243503)
@@ -107,6 +107,31 @@ extern vmem_t *zio_alloc_arena;
 extern int zfs_mg_alloc_failures;
 
 /*
+ * The following actions directly effect the spa's sync-to-convergence logic.
+ * The values below define the sync pass when we start performing the action.
+ * Care should be taken when changing these values as they directly impact
+ * spa_sync() performance. Tuning these values may introduce subtle performance
+ * pathologies and should only be done in the context of performance analysis.
+ * These tunables will eventually be removed and replaced with #defines once
+ * enough analysis has been done to determine optimal values.
+ *
+ * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
+ * regular blocks are not deferred.
+ */
+int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
+TUNABLE_INT("vfs.zfs.sync_pass_deferred_free", &zfs_sync_pass_deferred_free);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_deferred_free, CTLFLAG_RDTUN,
+    &zfs_sync_pass_deferred_free, 0, "defer frees starting in this pass");
+int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
+TUNABLE_INT("vfs.zfs.sync_pass_dont_compress", &zfs_sync_pass_dont_compress);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_dont_compress, CTLFLAG_RDTUN,
+    &zfs_sync_pass_dont_compress, 0, "don't compress starting in this pass");
+int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
+TUNABLE_INT("vfs.zfs.sync_pass_rewrite", &zfs_sync_pass_rewrite);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, sync_pass_rewrite, CTLFLAG_RDTUN,
+    &zfs_sync_pass_rewrite, 0, "rewrite new bps starting in this pass");
+
+/*
  * An allocating zio is one that either currently has the DVA allocate
  * stage set or will have it later in its lifetime.
  */
@@ -742,7 +767,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, ui
 
 	ASSERT(!BP_IS_HOLE(bp));
 	ASSERT(spa_syncing_txg(spa) == txg);
-	ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);
+	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
 
 	zio = zio_create(pio, spa, txg, bp, NULL, size,
 	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
@@ -1051,7 +1076,7 @@ zio_write_bp_init(zio_t *zio)
 		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 		ASSERT(!BP_GET_DEDUP(bp));
 
-		if (pass > SYNC_PASS_DONT_COMPRESS)
+		if (pass >= zfs_sync_pass_dont_compress)
 			compress = ZIO_COMPRESS_OFF;
 
 		/* Make sure someone doesn't change their mind on overwrites */
@@ -1080,7 +1105,7 @@ zio_write_bp_init(zio_t *zio)
 	 * There should only be a handful of blocks after pass 1 in any case.
 	 */
 	if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
-	    pass > SYNC_PASS_REWRITE) {
+	    pass >= zfs_sync_pass_rewrite) {
 		ASSERT(psize != 0);
 		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
 		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201211250906.qAP96W4D030291>