Date:      Fri, 21 Apr 2017 00:12:47 +0000 (UTC)
From:      Josh Paetzel <jpaetzel@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r317235 - in head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys
Message-ID:  <201704210012.v3L0ClUK077476@repo.freebsd.org>

Author: jpaetzel
Date: Fri Apr 21 00:12:47 2017
New Revision: 317235
URL: https://svnweb.freebsd.org/changeset/base/317235

Log:
  MFV 316868
  
  7430 Backfill metadnode more intelligently
  
  illumos/illumos-gate@af346df58864e8fe897b1ff1a3a4c12f9294391b
  https://github.com/illumos/illumos-gate/commit/af346df58864e8fe897b1ff1a3a4c12f9294391b
  
  https://www.illumos.org/issues/7430
    Description and patch brought over from the following ZoL commit:
    https://github.com/zfsonlinux/zfs/commit/68cbd56e182ab949f58d004778d463aeb3f595c6
    Only attempt to backfill lower metadnode object numbers if at least
    4096 objects have been freed since the last rescan, and at most once
    per transaction group. This avoids a pathology in dmu_object_alloc()
    that caused O(N^2) behavior for create-heavy workloads and
    substantially improves object creation rates. As summarized by
    @mahrens in #4636:
    "Normally, the object allocator simply checks to see if the next
    object is available. The slow calls happened when dmu_object_alloc()
    checks to see if it can backfill lower object numbers. This happens
    every time we move on to a new L1 indirect block (i.e. every 32 *
    128 = 4096 objects). When re-checking lower object numbers, we use
    the on-disk fill count (blkptr_t:blk_fill) to quickly skip over
    indirect blocks that don't have enough free dnodes (defined as an L2
    with at least 393,216 of 524,288 dnodes free). Therefore, we may
    find that a block of dnodes has a low (or zero) fill count, and yet
    we can't allocate any of its dnodes, because they've been allocated
    in memory but not yet written to disk. In this case we have to hold
    each of the dnodes and then notice that it has been allocated in
    memory.
    The end result is that allocating N objects in the same TXG can
    require CPU usage proportional to N^2."
    Add a tunable dmu_rescan_dnode_threshold to define the number of
    objects that must be freed before a rescan is performed. Don't bother
    to export this as a module option because testing doesn't show a
    compelling reason to change it. The vast majority of the performance
    gain comes from limiting the rescan to at most once per TXG.
  
  Reviewed by: Alek Pinchuk <alek@nexenta.com>
  Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
  Reviewed by: Matthew Ahrens <mahrens@delphix.com>
  Approved by: Gordon Ross <gordon.w.ross@gmail.com>
  Author: Ned Bass <bass6@llnl.gov>
  
  Obtained from:	Illumos
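
A back-of-envelope illustration of the O(N^2) pathology described in the log
(a toy cost model with assumed constants, not ZFS source): per the log, an L1
indirect block covers 32 * 128 = 4096 objects, and before this change every L1
boundary could force the allocator to revisit all dnodes already allocated in
the open txg, whereas afterwards that look-back runs at most once per txg.

#include <stdint.h>
#include <stdio.h>

#define	L1_DNODE_COUNT	4096ULL		/* 32 dnodes/block * 128 blkptrs/L1 */

int
main(void)
{
	uint64_t nobjs = 1000000;	/* objects created in a single txg */
	uint64_t old_work = 0, new_work = 0;
	int rescanned = 0;

	for (uint64_t obj = 1; obj <= nobjs; obj++) {
		if (obj % L1_DNODE_COUNT != 0)
			continue;	/* not an L1 boundary */
		/* Old scheme: look back over everything allocated so far. */
		old_work += obj;
		/* New scheme: at most one such look-back per txg. */
		if (!rescanned) {
			new_work += obj;
			rescanned = 1;
		}
	}
	(void) printf("old: %ju dnode visits, new: %ju\n",
	    (uintmax_t)old_work, (uintmax_t)new_work);
	return (0);
}

With nobjs at one million this prints roughly 122 million visits for the old
scheme versus 4096 for the new one, matching the "CPU usage proportional to
N^2" observation quoted above.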

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
Directory Properties:
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c	Fri Apr 21 00:00:23 2017	(r317234)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c	Fri Apr 21 00:12:47 2017	(r317235)
@@ -36,20 +36,22 @@ dmu_object_alloc(objset_t *os, dmu_objec
     dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
 	uint64_t object;
-	uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
+	uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
 	    (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
 	dnode_t *dn = NULL;
-	int restarted = B_FALSE;
 
 	mutex_enter(&os->os_obj_lock);
 	for (;;) {
 		object = os->os_obj_next;
 		/*
-		 * Each time we polish off an L2 bp worth of dnodes
-		 * (2^13 objects), move to another L2 bp that's still
-		 * reasonably sparse (at most 1/4 full).  Look from the
-		 * beginning once, but after that keep looking from here.
-		 * If we can't find one, just keep going from here.
+		 * Each time we polish off a L1 bp worth of dnodes (2^12
+		 * objects), move to another L1 bp that's still reasonably
+		 * sparse (at most 1/4 full). Look from the beginning at most
+		 * once per txg, but after that keep looking from here.
+		 * os_scan_dnodes is set during txg sync if enough objects
+		 * have been freed since the previous rescan to justify
+		 * backfilling again. If we can't find a suitable block, just
+		 * keep going from here.
 		 *
 		 * Note that dmu_traverse depends on the behavior that we use
 		 * multiple blocks of the dnode object before going back to
@@ -57,12 +59,19 @@ dmu_object_alloc(objset_t *os, dmu_objec
 		 * that property or find another solution to the issues
 		 * described in traverse_visitbp.
 		 */
-		if (P2PHASE(object, L2_dnode_count) == 0) {
-			uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
-			int error = dnode_next_offset(DMU_META_DNODE(os),
+
+		if (P2PHASE(object, L1_dnode_count) == 0) {
+			uint64_t offset;
+			int error;
+			if (os->os_rescan_dnodes) {
+				offset = 0;
+				os->os_rescan_dnodes = B_FALSE;
+			} else {
+				offset = object << DNODE_SHIFT;
+			}
+			error = dnode_next_offset(DMU_META_DNODE(os),
 			    DNODE_FIND_HOLE,
 			    &offset, 2, DNODES_PER_BLOCK >> 2, 0);
-			restarted = B_TRUE;
 			if (error == 0)
 				object = offset >> DNODE_SHIFT;
 		}
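
For readers less familiar with the macros in this hunk, a minimal standalone
sketch of the boundary test and offset conversion (constants assumed here:
512-byte dnodes, hence a shift of 9, and 4096 dnodes per L1 as the new comment
states; P2PHASE(x, align) reduces x modulo a power-of-two align):

#include <stdint.h>
#include <stdio.h>

#define	TOY_DNODE_SHIFT		9		/* 512-byte dnodes (assumed) */
#define	TOY_L1_DNODE_COUNT	4096ULL		/* 32 * 128, per the comment above */
#define	P2PHASE(x, align)	((x) & ((align) - 1))	/* x % align, align a power of 2 */

int
main(void)
{
	uint64_t object = 8192;		/* hypothetical next object number */

	/*
	 * A zero phase means the object sits on an L1 indirect-block
	 * boundary, the only point at which a backfill scan is even
	 * considered.  dnode_next_offset() takes a byte offset within
	 * the meta-dnode, hence the left shift.
	 */
	if (P2PHASE(object, TOY_L1_DNODE_COUNT) == 0) {
		uint64_t offset = object << TOY_DNODE_SHIFT;
		(void) printf("object %ju is on an L1 boundary; "
		    "scan starts at byte offset %ju\n",
		    (uintmax_t)object, (uintmax_t)offset);
	}
	return (0);
}

The real function then hands that byte offset to dnode_next_offset() with
DNODE_FIND_HOLE and, on success, converts the hole it found back to an object
number by shifting right by DNODE_SHIFT, as shown in the diff above.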

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Fri Apr 21 00:00:23 2017	(r317234)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Fri Apr 21 00:12:47 2017	(r317235)
@@ -67,6 +67,13 @@ krwlock_t os_lock;
  */
 int dmu_find_threads = 0;
 
+/*
+ * Backfill lower metadnode objects after this many have been freed.
+ * Backfilling negatively impacts object creation rates, so only do it
+ * if there are enough holes to fill.
+ */
+int dmu_rescan_dnode_threshold = 131072;
+
 static void dmu_objset_find_dp_cb(void *arg);
 
 void
@@ -1176,6 +1183,13 @@ dmu_objset_sync(objset_t *os, zio_t *pio
 		if (dr->dr_zio)
 			zio_nowait(dr->dr_zio);
 	}
+
+	/* Enable dnode backfill if enough objects have been freed. */
+	if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
+		os->os_rescan_dnodes = B_TRUE;
+		os->os_freed_dnodes = 0;
+	}
+
 	/*
 	 * Free intent log blocks up to this tx.
 	 */
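
As a back-of-envelope check (a reader's arithmetic, not part of the commit):
the default dmu_rescan_dnode_threshold of 131072 freed dnodes corresponds to
32 full L1 blocks' worth (131072 / 4096), the counter is reset to zero each
time the flag is armed, and because the test lives in dmu_objset_sync() the
rescan is triggered from the sync path, matching the "at most once per
transaction group" limit described in the log.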

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c	Fri Apr 21 00:00:23 2017	(r317234)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c	Fri Apr 21 00:12:47 2017	(r317235)
@@ -672,6 +672,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx)
 	}
 
 	if (freeing_dnode) {
+		dn->dn_objset->os_freed_dnodes++;
 		dnode_sync_free(dn, tx);
 		return;
 	}

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h	Fri Apr 21 00:00:23 2017	(r317234)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h	Fri Apr 21 00:12:47 2017	(r317235)
@@ -112,6 +112,8 @@ struct objset {
 	zil_header_t os_zil_header;
 	list_t os_synced_dnodes;
 	uint64_t os_flags;
+	uint64_t os_freed_dnodes;
+	boolean_t os_rescan_dnodes;
 
 	/* Protected by os_obj_lock */
 	kmutex_t os_obj_lock;


