FreeBSD Mail Archives

Date:      Fri, 11 Oct 2019 14:59:28 +0000 (UTC)
From:      Alan Somers <asomers@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r353439 - head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs
Message-ID:  <201910111459.x9BExSGP021208@repo.freebsd.org>

index | next in thread | raw e-mail


Author: asomers
Date: Fri Oct 11 14:59:28 2019
New Revision: 353439
URL: https://svnweb.freebsd.org/changeset/base/353439

Log:
  MFZol: Fix performance of "zfs recv" with many deletions
  
  This patch fixes 2 issues with the DMU free throttle implemented
  in dmu_free_long_range(). The first issue is that get_next_chunk()
  was incorrectly calculating the number of L1 blocks the free would
  dirty. In some cases involving extremely large files, this
  code would greatly overestimate the number of affected L1 blocks,
  causing excessive calls to txg_wait_open(). This patch corrects
  the calculation.
  
  The second issue is that the free throttle uses the total number
  of free'd blocks in all (open, quiescing, and syncing) txgs to
  determine whether to throttle. This causes large frees (such as
  those created by the first issue) to trigger 4 txg syncs before
  any further frees are allowed to proceed. This patch ensures
  that the accounting is done entirely in a per-txg fashion, so
  that frees from a given txg don't affect those that immediately
  follow it.
  
  Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
  Reviewed-by: Matthew Ahrens <mahrens@delphix.com>
  Signed-off-by: Tom Caputi <tcaputi@datto.com>
  zfsonlinux/zfs@f4c594da94d856c422512a54e48070f890b2685b
  
  Freeing throttle should account for holes
  
  Deletion throttle currently does not account for holes in a file.
  This means that it can activate when it shouldn't.
  To fix this, we switch the throttle to be based on the number of
  L1 blocks we will have to dirty when freeing.
  
  Reviewed-by: Tom Caputi <tcaputi@datto.com>
  Reviewed-by: Matt Ahrens <mahrens@delphix.com>
  Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
  Signed-off-by: Alek Pinchuk <apinchuk@datto.com>
  zfsonlinux/zfs@65282ee9e06b130f1f0169baf5d9bf0dd8fc1ef9
  
  Submitted by:	Alek Pinchuk <pinchuk.alek@gmail.com>
  Reviewed by:	allanjude
  MFC after:	2 weeks
  Sponsored by:	Axcient
  Differential Revision:	https://reviews.freebsd.org/D21895

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Fri Oct 11 14:57:47 2019	(r353438)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Fri Oct 11 14:59:28 2019	(r353439)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2019 Datto Inc.
  */
 /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
 /* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
@@ -62,14 +63,15 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFL
     &zfs_nopwrite_enabled, 0, "Enable nopwrite feature");
 
 /*
- * Tunable to control percentage of dirtied blocks from frees in one TXG.
- * After this threshold is crossed, additional dirty blocks from frees
- * wait until the next TXG.
+ * Tunable to control percentage of dirtied L1 blocks from frees allowed into
+ * one TXG. After this threshold is crossed, additional dirty blocks from frees
+ * will wait until the next TXG.
  * A value of zero will disable this throttle.
  */
-uint32_t zfs_per_txg_dirty_frees_percent = 30;
+uint32_t zfs_per_txg_dirty_frees_percent = 5;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN,
-	&zfs_per_txg_dirty_frees_percent, 0, "Percentage of dirtied blocks from frees in one txg");
+	&zfs_per_txg_dirty_frees_percent, 0,
+	"Percentage of dirtied indirect blocks from frees allowed in one txg");
 
 /*
  * This can be used for testing, to ensure that certain actions happen
@@ -683,11 +685,13 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t le
  *
  * On input, *start should be the first offset that does not need to be
  * freed (e.g. "offset + length").  On return, *start will be the first
- * offset that should be freed.
+ * offset that should be freed and l1blks is set to the number of level 1
+ * indirect blocks found within the chunk.
  */
 static int
-get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
 {
+	uint64_t blks;
 	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
 	/* bytes of data covered by a level-1 indirect block */
 	uint64_t iblkrange =
@@ -695,13 +699,23 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t 
 
 	ASSERT3U(minimum, <=, *start);
 
-	if (*start - minimum <= iblkrange * maxblks) {
+	/*
+	 * Check if we can free the entire range assuming that all of the
+	 * L1 blocks in this range have data. If we can, we use this
+	 * worst case value as an estimate so we can avoid having to look
+	 * at the object's actual data.
+	 */
+	uint64_t total_l1blks =
+	    (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) /
+	    iblkrange;
+	if (total_l1blks <= maxblks) {
+		*l1blks = total_l1blks;
 		*start = minimum;
 		return (0);
 	}
 	ASSERT(ISP2(iblkrange));
 
-	for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
+	for (blks = 0; *start > minimum && blks < maxblks; blks++) {
 		int err;
 
 		/*
@@ -711,6 +725,7 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t 
 		 * to search.
 		 */
 		(*start)--;
+
 		err = dnode_next_offset(dn,
 		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);
 
@@ -719,6 +734,7 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t 
 			*start = minimum;
 			break;
 		} else if (err != 0) {
+			*l1blks = blks;
 			return (err);
 		}
 
@@ -727,6 +743,8 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t 
 	}
 	if (*start < minimum)
 		*start = minimum;
+	*l1blks = blks;
+
 	return (0);
 }
 
@@ -762,14 +780,14 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, ui
 		dirty_frees_threshold =
 		    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
 	else
-		dirty_frees_threshold = zfs_dirty_data_max / 4;
+		dirty_frees_threshold = zfs_dirty_data_max / 20;
 
 	if (length == DMU_OBJECT_END || offset + length > object_size)
 		length = object_size - offset;
 
 	while (length != 0) {
 		uint64_t chunk_end, chunk_begin, chunk_len;
-		uint64_t long_free_dirty_all_txgs = 0;
+		uint64_t l1blks;
 		dmu_tx_t *tx;
 
 		if (dmu_objset_zfs_unmounting(dn->dn_objset))
@@ -778,7 +796,7 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, ui
 		chunk_end = chunk_begin = offset + length;
 
 		/* move chunk_begin backwards to the beginning of this chunk */
-		err = get_next_chunk(dn, &chunk_begin, offset);
+		err = get_next_chunk(dn, &chunk_begin, offset, &l1blks);
 		if (err)
 			return (err);
 		ASSERT3U(chunk_begin, >=, offset);
@@ -786,24 +804,6 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, ui
 
 		chunk_len = chunk_end - chunk_begin;
 
-		mutex_enter(&dp->dp_lock);
-		for (int t = 0; t < TXG_SIZE; t++) {
-			long_free_dirty_all_txgs +=
-			    dp->dp_long_free_dirty_pertxg[t];
-		}
-		mutex_exit(&dp->dp_lock);
-
-		/*
-		 * To avoid filling up a TXG with just frees wait for
-		 * the next TXG to open before freeing more chunks if
-		 * we have reached the threshold of frees
-		 */
-		if (dirty_frees_threshold != 0 &&
-		    long_free_dirty_all_txgs >= dirty_frees_threshold) {
-			txg_wait_open(dp, 0);
-			continue;
-		}
-
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
 
@@ -818,13 +818,42 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, ui
 			return (err);
 		}
 
+		uint64_t txg = dmu_tx_get_txg(tx);
+
 		mutex_enter(&dp->dp_lock);
-		dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
-		    chunk_len;
+		uint64_t long_free_dirty =
+		    dp->dp_long_free_dirty_pertxg[txg & TXG_MASK];
 		mutex_exit(&dp->dp_lock);
+
+		/*
+		 * To avoid filling up a TXG with just frees, wait for
+		 * the next TXG to open before freeing more chunks if
+		 * we have reached the threshold of frees.
+		 */
+		if (dirty_frees_threshold != 0 &&
+		    long_free_dirty >= dirty_frees_threshold) {
+			dmu_tx_commit(tx);
+			txg_wait_open(dp, 0);
+			continue;
+		}
+
+		/*
+		 * In order to prevent unnecessary write throttling, for each
+		 * TXG, we track the cumulative size of L1 blocks being dirtied
+		 * in dnode_free_range() below. We compare this number to a
+		 * tunable threshold, past which we prevent new L1 dirty freeing
+		 * blocks from being added into the open TXG. See
+		 * dmu_free_long_range_impl() for details. The threshold
+		 * prevents write throttle activation due to dirty freeing L1
+		 * blocks taking up a large percentage of zfs_dirty_data_max.
+		 */
+		mutex_enter(&dp->dp_lock);
+		dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] +=
+		    l1blks << dn->dn_indblkshift;
+		mutex_exit(&dp->dp_lock);
 		DTRACE_PROBE3(free__long__range,
-		    uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
-		    uint64_t, dmu_tx_get_txg(tx));
+		    uint64_t, long_free_dirty, uint64_t, chunk_len,
+		    uint64_t, txg);
 		dnode_free_range(dn, chunk_begin, chunk_len, tx);
 		dmu_tx_commit(tx);

home | help

Want to link to this message? Use this
URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201910111459.x9BExSGP021208>

Header And Logo

Peripheral Links

Site Navigation

Header And Logo

Peripheral Links

Search

Site Navigation