Date: Fri, 11 Oct 2019 14:59:28 +0000 (UTC) From: Alan Somers <asomers@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r353439 - head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs Message-ID: <201910111459.x9BExSGP021208@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: asomers Date: Fri Oct 11 14:59:28 2019 New Revision: 353439 URL: https://svnweb.freebsd.org/changeset/base/353439 Log: MFZol: Fix performance of "zfs recv" with many deletions This patch fixes 2 issues with the DMU free throttle implemented in dmu_free_long_range(). The first issue is that get_next_chunk() was calculating the number of L1 blocks the free would dirty incorrectly. In some cases involving extremely large files, this code would greatly overestimate the number of affected L1 blocks, causing excessive calls to txg_wait_open(). This patch corrects the calculation. The second issue is that the free throttle uses the total number of free'd blocks in all (open, quiescing, and syncing) txgs to determine whether to throttle. This causes large frees (such as those created by the first issue) to cause 4 txg syncs before any further frees were allowed to proceed. This patch ensures that the accounting is done entirely in a per-txg fashion, so that frees from a given txg don't affect those that immediately follow it. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Signed-off-by: Tom Caputi <tcaputi@datto.com> zfsonlinux/zfs@f4c594da94d856c422512a54e48070f890b2685b Freeing throttle should account for holes Deletion throttle currently does not account for holes in a file. This means that it can activate when it shouldn't. To fix it we switch the throttle to be based on the number of L1 blocks we will have to dirty when freeing Reviewed-by: Tom Caputi <tcaputi@datto.com> Reviewed-by: Matt Ahrens <mahrens@delphix.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alek Pinchuk <apinchuk@datto.com> zfsonlinux/zfs@65282ee9e06b130f1f0169baf5d9bf0dd8fc1ef9 Submitted by: Alek Pinchuk <pinchuk.alek@gmail.com> Reviewed by: allanjude MFC after: 2 weeks Sponsored by: Axcient Differential Revision: https://reviews.freebsd.org/D21895 Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c Fri Oct 11 14:57:47 2019 (r353438) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c Fri Oct 11 14:59:28 2019 (r353439) @@ -21,6 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2019 Datto Inc. */ /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ /* Copyright (c) 2013, Joyent, Inc. All rights reserved. */ @@ -62,14 +63,15 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFL &zfs_nopwrite_enabled, 0, "Enable nopwrite feature"); /* - * Tunable to control percentage of dirtied blocks from frees in one TXG. - * After this threshold is crossed, additional dirty blocks from frees - * wait until the next TXG. + * Tunable to control percentage of dirtied L1 blocks from frees allowed into + * one TXG. After this threshold is crossed, additional dirty blocks from frees + * will wait until the next TXG. * A value of zero will disable this throttle. */ -uint32_t zfs_per_txg_dirty_frees_percent = 30; +uint32_t zfs_per_txg_dirty_frees_percent = 5; SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN, - &zfs_per_txg_dirty_frees_percent, 0, "Percentage of dirtied blocks from frees in one txg"); + &zfs_per_txg_dirty_frees_percent, 0, + "Percentage of dirtied indirect blocks from frees allowed in one txg"); /* * This can be used for testing, to ensure that certain actions happen @@ -683,11 +685,13 @@ dmu_prefetch(objset_t *os, uint64_t object, int64_t le * * On input, *start should be the first offset that does not need to be * freed (e.g. "offset + length"). On return, *start will be the first - * offset that should be freed. + * offset that should be freed and l1blks is set to the number of level 1 + * indirect blocks found within the chunk. */ static int -get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum) +get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks) { + uint64_t blks; uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); /* bytes of data covered by a level-1 indirect block */ uint64_t iblkrange = @@ -695,13 +699,23 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t ASSERT3U(minimum, <=, *start); - if (*start - minimum <= iblkrange * maxblks) { + /* + * Check if we can free the entire range assuming that all of the + * L1 blocks in this range have data. If we can, we use this + * worst case value as an estimate so we can avoid having to look + * at the object's actual data. + */ + uint64_t total_l1blks = + (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) / + iblkrange; + if (total_l1blks <= maxblks) { + *l1blks = total_l1blks; *start = minimum; return (0); } ASSERT(ISP2(iblkrange)); - for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) { + for (blks = 0; *start > minimum && blks < maxblks; blks++) { int err; /* @@ -711,6 +725,7 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t * to search. */ (*start)--; + err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); @@ -719,6 +734,7 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t *start = minimum; break; } else if (err != 0) { + *l1blks = blks; return (err); } @@ -727,6 +743,8 @@ get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t } if (*start < minimum) *start = minimum; + *l1blks = blks; + return (0); } @@ -762,14 +780,14 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, ui dirty_frees_threshold = zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; else - dirty_frees_threshold = zfs_dirty_data_max / 4; + dirty_frees_threshold = zfs_dirty_data_max / 20; if (length == DMU_OBJECT_END || offset + length > object_size) length = object_size - offset; while (length != 0) { uint64_t chunk_end, chunk_begin, chunk_len; - uint64_t long_free_dirty_all_txgs = 0; + uint64_t l1blks; dmu_tx_t *tx; if (dmu_objset_zfs_unmounting(dn->dn_objset)) @@ -778,7 +796,7 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, ui chunk_end = chunk_begin = offset + length; /* move chunk_begin backwards to the beginning of this chunk */ - err = get_next_chunk(dn, &chunk_begin, offset); + err = get_next_chunk(dn, &chunk_begin, offset, &l1blks); if (err) return (err); ASSERT3U(chunk_begin, >=, offset); @@ -786,24 +804,6 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, ui chunk_len = chunk_end - chunk_begin; - mutex_enter(&dp->dp_lock); - for (int t = 0; t < TXG_SIZE; t++) { - long_free_dirty_all_txgs += - dp->dp_long_free_dirty_pertxg[t]; - } - mutex_exit(&dp->dp_lock); - - /* - * To avoid filling up a TXG with just frees wait for - * the next TXG to open before freeing more chunks if - * we have reached the threshold of frees - */ - if (dirty_frees_threshold != 0 && - long_free_dirty_all_txgs >= dirty_frees_threshold) { - txg_wait_open(dp, 0); - continue; - } - tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); @@ -818,13 +818,42 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, ui return (err); } + uint64_t txg = dmu_tx_get_txg(tx); + mutex_enter(&dp->dp_lock); - dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] += - chunk_len; + uint64_t long_free_dirty = + dp->dp_long_free_dirty_pertxg[txg & TXG_MASK]; mutex_exit(&dp->dp_lock); + + /* + * To avoid filling up a TXG with just frees, wait for + * the next TXG to open before freeing more chunks if + * we have reached the threshold of frees. + */ + if (dirty_frees_threshold != 0 && + long_free_dirty >= dirty_frees_threshold) { + dmu_tx_commit(tx); + txg_wait_open(dp, 0); + continue; + } + + /* + * In order to prevent unnecessary write throttling, for each + * TXG, we track the cumulative size of L1 blocks being dirtied + * in dnode_free_range() below. We compare this number to a + * tunable threshold, past which we prevent new L1 dirty freeing + * blocks from being added into the open TXG. See + * dmu_free_long_range_impl() for details. The threshold + * prevents write throttle activation due to dirty freeing L1 + * blocks taking up a large percentage of zfs_dirty_data_max. + */ + mutex_enter(&dp->dp_lock); + dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] += + l1blks << dn->dn_indblkshift; + mutex_exit(&dp->dp_lock); DTRACE_PROBE3(free__long__range, - uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len, - uint64_t, dmu_tx_get_txg(tx)); + uint64_t, long_free_dirty, uint64_t, chunk_len, + uint64_t, txg); dnode_free_range(dn, chunk_begin, chunk_len, tx); dmu_tx_commit(tx);
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201910111459.x9BExSGP021208>