Date: Thu, 21 Nov 2019 13:46:17 +0000 (UTC) From: Andriy Gapon <avg@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org Subject: svn commit: r354949 - vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor-sys/illumos/dist/uts/common/sys vendor-sys/illumos/dist/uts/common/sys/fs vendor... Message-ID: <201911211346.xALDkHZ8035760@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: avg Date: Thu Nov 21 13:46:16 2019 New Revision: 354949 URL: https://svnweb.freebsd.org/changeset/base/354949 Log: 10405 Implement ZFS sorted scans illumos/illumos-gate@a3874b8b1fe5103fc1f961609557c0587435fec0 https://github.com/illumos/illumos-gate/commit/a3874b8b1fe5103fc1f961609557c0587435fec0 https://www.illumos.org/issues/10405 The original implementation is: https://github.com/zfsonlinux/zfs/commit/d4a72f23863382bdf6d0ae33196f5b5decbc48fd Author: Toomas Soome <tsoome@me.com> Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c vendor-sys/illumos/dist/uts/common/fs/zfs/sys/arc.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_pool.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_scan.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/range_tree.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa_impl.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_file.c vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_indirect.c vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_missing.c vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_raidz.c vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_root.c vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h vendor-sys/illumos/dist/uts/common/sys/taskq.h Changes in other areas also in this revision: Modified: vendor/illumos/dist/cmd/zdb/zdb.c vendor/illumos/dist/cmd/zpool/zpool_main.c vendor/illumos/dist/cmd/ztest/ztest.c vendor/illumos/dist/lib/libzfs/common/libzfs_status.c Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c Thu Nov 21 13:35:43 2019 (r354948) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c Thu Nov 21 13:46:16 2019 (r354949) @@ -348,7 +348,8 @@ int arc_no_grow_shift = 5; * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ -static int arc_min_prefetch_lifespan; +static int zfs_arc_min_prefetch_ms = 1; +static int zfs_arc_min_prescient_prefetch_ms = 6; /* * If this percent of memory is free, don't throttle. @@ -690,8 +691,9 @@ typedef struct arc_stats { kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_meta_max; kstat_named_t arcstat_meta_min; - kstat_named_t arcstat_sync_wait_for_async; + kstat_named_t arcstat_async_upgrade_sync; kstat_named_t arcstat_demand_hit_predictive_prefetch; + kstat_named_t arcstat_demand_hit_prescient_prefetch; } arc_stats_t; static arc_stats_t arc_stats = { @@ -774,8 +776,9 @@ static arc_stats_t arc_stats = { { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 }, - { "sync_wait_for_async", KSTAT_DATA_UINT64 }, + { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, + { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -872,22 +875,23 @@ typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; - arc_done_func_t *acb_done; + arc_read_done_func_t *acb_done; arc_buf_t *acb_buf; boolean_t acb_compressed; zio_t *acb_zio_dummy; + zio_t *acb_zio_head; arc_callback_t *acb_next; }; typedef struct arc_write_callback arc_write_callback_t; struct arc_write_callback { - void *awcb_private; - arc_done_func_t *awcb_ready; - arc_done_func_t *awcb_children_ready; - arc_done_func_t *awcb_physdone; - arc_done_func_t *awcb_done; - arc_buf_t *awcb_buf; + void *awcb_private; + arc_write_done_func_t *awcb_ready; + arc_write_done_func_t *awcb_children_ready; + arc_write_done_func_t *awcb_physdone; + arc_write_done_func_t *awcb_done; + arc_buf_t *awcb_buf; }; /* @@ -1013,6 +1017,8 @@ struct arc_buf_hdr { #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) +#define HDR_PRESCIENT_PREFETCH(hdr) \ + ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) @@ -3243,6 +3249,8 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; + int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? + zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); @@ -3295,8 +3303,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(hdr) || ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && - ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < - arc_min_prefetch_lifespan)) { + ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } @@ -4607,13 +4614,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ - if (HDR_PREFETCH(hdr)) { + if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { /* link protected by hash lock */ ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH); ARCSTAT_BUMP(arcstat_mru_hits); } hdr->b_l1hdr.b_arc_access = now; @@ -4644,10 +4653,13 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * MFU state. */ - if (HDR_PREFETCH(hdr)) { + if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { new_state = arc_mru; - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); + if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH); + } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; @@ -4668,11 +4680,6 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * If it was a prefetch, we will explicitly move it to * the head of the list now. */ - if ((HDR_PREFETCH(hdr)) != 0) { - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - /* link protected by hash_lock */ - ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - } ARCSTAT_BUMP(arcstat_mfu_hits); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { @@ -4683,12 +4690,11 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * MFU state. */ - if (HDR_PREFETCH(hdr)) { + if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. */ - ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); new_state = arc_mru; } @@ -4710,21 +4716,26 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) } } -/* a generic arc_done_func_t which you can use */ +/* a generic arc_read_done_func_t which you can use */ /* ARGSUSED */ void -arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) +arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *arg) { - if (zio == NULL || zio->io_error == 0) - bcopy(buf->b_data, arg, arc_buf_size(buf)); + if (buf == NULL) + return; + + bcopy(buf->b_data, arg, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } -/* a generic arc_done_func_t */ +/* a generic arc_read_done_func_t */ void -arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) +arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; + if (buf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); *bufp = NULL; @@ -4759,7 +4770,6 @@ arc_read_done(zio_t *zio) arc_callback_t *callback_list; arc_callback_t *acb; boolean_t freeable = B_FALSE; - boolean_t no_zio_error = (zio->io_error == 0); /* * The hdr was inserted into hash-table and removed from lists @@ -4785,7 +4795,7 @@ arc_read_done(zio_t *zio) ASSERT3P(hash_lock, !=, NULL); } - if (no_zio_error) { + if (zio->io_error == 0) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { @@ -4806,7 +4816,8 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); - if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { + if (hash_lock && zio->io_error == 0 && + hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already @@ -4827,30 +4838,29 @@ arc_read_done(zio_t *zio) if (!acb->acb_done) continue; - /* This is a demand read since prefetches don't use callbacks */ callback_cnt++; - if (no_zio_error) { - int error = arc_buf_alloc_impl(hdr, acb->acb_private, - acb->acb_compressed, zio->io_error == 0, - &acb->acb_buf); - if (error != 0) { - /* - * Decompression failed. Set io_error - * so that when we call acb_done (below), - * we will indicate that the read failed. - * Note that in the unusual case where one - * callback is compressed and another - * uncompressed, we will mark all of them - * as failed, even though the uncompressed - * one can't actually fail. In this case, - * the hdr will not be anonymous, because - * if there are multiple callbacks, it's - * because multiple threads found the same - * arc buf in the hash table. - */ - zio->io_error = error; - } + if (zio->io_error != 0) + continue; + + int error = arc_buf_alloc_impl(hdr, acb->acb_private, + acb->acb_compressed, B_TRUE, &acb->acb_buf); + if (error != 0) { + /* + * Decompression failed. Set io_error + * so that when we call acb_done (below), + * we will indicate that the read failed. + * Note that in the unusual case where one + * callback is compressed and another + * uncompressed, we will mark all of them + * as failed, even though the uncompressed + * one can't actually fail. In this case, + * the hdr will not be anonymous, because + * if there are multiple callbacks, it's + * because multiple threads found the same + * arc buf in the hash table. + */ + zio->io_error = error; } } /* @@ -4873,7 +4883,7 @@ arc_read_done(zio_t *zio) ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); - if (no_zio_error) { + if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); @@ -4916,7 +4926,8 @@ arc_read_done(zio_t *zio) arc_buf_destroy(acb->acb_buf, acb->acb_private); acb->acb_buf = NULL; } - acb->acb_done(zio, acb->acb_buf, acb->acb_private); + acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, + acb->acb_buf, acb->acb_private); } if (acb->acb_zio_dummy != NULL) { @@ -4951,7 +4962,7 @@ arc_read_done(zio_t *zio) * for readers of this block. */ int -arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, +arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { @@ -4960,6 +4971,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, a zio_t *rzio; uint64_t guid = spa_load_guid(spa); boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; + int rc = 0; ASSERT(!BP_IS_EMBEDDED(bp) || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); @@ -4978,32 +4990,20 @@ top: *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { + zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; + ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* - * This sync read must wait for an - * in-progress async read (e.g. a predictive - * prefetch). Async reads are queued - * separately at the vdev_queue layer, so - * this is a form of priority inversion. - * Ideally, we would "inherit" the demand - * i/o's priority by moving the i/o from - * the async queue to the synchronous queue, - * but there is currently no mechanism to do - * so. Track this so that we can evaluate - * the magnitude of this potential performance - * problem. - * - * Note that if the prefetch i/o is already - * active (has been issued to the device), - * the prefetch improved performance, because - * we issued it sooner than we would have - * without the prefetch. + * This is a sync read that needs to wait for + * an in-flight async read. Request that the + * zio have its priority upgraded. */ - DTRACE_PROBE1(arc__sync__wait__for__async, + zio_change_priority(head_zio, priority); + DTRACE_PROBE1(arc__async__upgrade__sync, arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_sync_wait_for_async); + ARCSTAT_BUMP(arcstat_async_upgrade_sync); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { arc_hdr_clear_flags(hdr, @@ -5030,6 +5030,7 @@ top: spa, NULL, NULL, NULL, zio_flags); ASSERT3P(acb->acb_done, !=, NULL); + acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); @@ -5057,17 +5058,33 @@ top: arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } + + if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { + ARCSTAT_BUMP( + arcstat_demand_hit_prescient_prefetch); + arc_hdr_clear_flags(hdr, + ARC_FLAG_PRESCIENT_PREFETCH); + } + ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. */ - VERIFY0(arc_buf_alloc_impl(hdr, private, - compressed_read, B_TRUE, &buf)); + rc = arc_buf_alloc_impl(hdr, private, + compressed_read, B_TRUE, &buf); + if (rc != 0) { + arc_buf_destroy(buf, private); + buf = NULL; + } + ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || + rc == 0 || rc != ENOENT); } else if (*arc_flags & ARC_FLAG_PREFETCH && zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); + if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); @@ -5077,7 +5094,7 @@ top: data, metadata, hits); if (done) - done(NULL, buf, private); + done(NULL, zb, bp, buf, private); } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); @@ -5151,6 +5168,9 @@ top: if (*arc_flags & ARC_FLAG_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); + if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_GET_LEVEL(bp) > 0) @@ -5180,14 +5200,17 @@ top: vd = NULL; } - if (priority == ZIO_PRIORITY_ASYNC_READ) + /* + * We count both async reads and scrub IOs as asynchronous so + * that both can be upgraded in the event of a cache hit while + * the read IO is still in-flight. + */ + if (priority == ZIO_PRIORITY_ASYNC_READ || + priority == ZIO_PRIORITY_SCRUB) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); - if (hash_lock != NULL) - mutex_exit(hash_lock); - /* * At this point, we have a level 1 cache miss. Try again in * L2ARC if possible. @@ -5257,6 +5280,11 @@ top: ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); + acb->acb_zio_head = rzio; + + if (hash_lock != NULL) + mutex_exit(hash_lock); + DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, size); @@ -5271,6 +5299,8 @@ top: return (0); /* l2arc read error; goto zio_read() */ + if (hash_lock != NULL) + mutex_enter(hash_lock); } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); @@ -5291,7 +5321,11 @@ top: rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, arc_read_done, hdr, priority, zio_flags, zb); + acb->acb_zio_head = rzio; + if (hash_lock != NULL) + mutex_exit(hash_lock); + if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); @@ -5778,9 +5812,9 @@ arc_write_done(zio_t *zio) zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, - arc_done_func_t *children_ready, arc_done_func_t *physdone, - arc_done_func_t *done, void *private, zio_priority_t priority, + boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, + arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, + arc_write_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; @@ -6191,9 +6225,6 @@ arc_init(void) #endif mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL); - - /* Convert seconds to clock ticks */ - arc_min_prefetch_lifespan = 1 * hz; /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ arc_c_min = MAX(allmem / 32, 64 << 20); Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c Thu Nov 21 13:35:43 2019 (r354948) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c Thu Nov 21 13:46:16 2019 (r354949) @@ -942,7 +942,8 @@ dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t o } static void -dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) +dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; @@ -961,15 +962,19 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT3P(db->db_buf, ==, NULL); db->db_state = DB_UNCACHED; } else if (db->db_level == 0 && db->db_freed_in_flight) { - /* freed in flight */ + /* we were freed in flight; disregard any error */ ASSERT(zio == NULL || zio->io_error == 0); + if (buf == NULL) { + buf = arc_alloc_buf(db->db_objset->os_spa, + db, DBUF_GET_BUFC_TYPE(db), db->db.db_size); + } arc_release(buf, db); bzero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; - } else { + } else if (buf != NULL) { /* success */ ASSERT(zio == NULL || zio->io_error == 0); dbuf_set_data(db, buf); @@ -2395,7 +2400,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, bl * prefetch if the next block down is our target. */ static void -dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) +dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, + const blkptr_t *iobp, arc_buf_t *abuf, void *private) { dbuf_prefetch_arg_t *dpa = private; @@ -2442,11 +2448,11 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abu } dpa->dpa_curlevel--; - uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); blkptr_t *bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); + if (BP_IS_HOLE(bp)) { kmem_free(dpa, sizeof (*dpa)); } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { @@ -3852,7 +3858,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, d * ready callback so that we can properly handle an indirect * block that only contains holes. */ - arc_done_func_t *children_ready_cb = NULL; + arc_write_done_func_t *children_ready_cb = NULL; if (db->db_level != 0) children_ready_cb = dbuf_write_children_ready; Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c Thu Nov 21 13:35:43 2019 (r354948) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c Thu Nov 21 13:46:16 2019 (r354949) @@ -1106,14 +1106,26 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) void ddt_sync(spa_t *spa, uint64_t txg) { + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dmu_tx_t *tx; - zio_t *rio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); + zio_t *rio; ASSERT(spa_syncing_txg(spa) == txg); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + rio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); + + /* + * This function may cause an immediate scan of ddt blocks (see + * the comment above dsl_scan_ddt() for details). We set the + * scan's root zio here so that we can wait for any scan IOs in + * addition to the regular ddt IOs. + */ + ASSERT3P(scn->scn_zio_root, ==, NULL); + scn->scn_zio_root = rio; + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL) @@ -1123,6 +1135,7 @@ ddt_sync(spa_t *spa, uint64_t txg) } (void) zio_wait(rio); + scn->scn_zio_root = NULL; dmu_tx_commit(tx); } Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c Thu Nov 21 13:35:43 2019 (r354948) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c Thu Nov 21 13:46:16 2019 (r354949) @@ -398,6 +398,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, bl ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); +#if 0 /* * The $ORIGIN dataset (if it exists) doesn't have an associated * objset, so there's no reason to open it. The $ORIGIN dataset @@ -408,6 +409,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, bl ASSERT3P(ds->ds_dir, !=, spa_get_dsl(spa)->dp_origin_snap->ds_dir); } +#endif os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); os->os_dsl_dataset = ds; Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c Thu Nov 21 13:35:43 2019 (r354948) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c Thu Nov 21 13:46:16 2019 (r354949) @@ -492,7 +492,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { prefetch_data_t *pfd = arg; - arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH; ASSERT(pfd->pd_bytes_fetched >= 0); if (bp == NULL) Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c Thu Nov 21 13:35:43 2019 (r354948) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c Thu Nov 21 13:46:16 2019 (r354949) @@ -51,34 +51,157 @@ #include <sys/sa_impl.h> #include <sys/zfeature.h> #include <sys/abd.h> +#include <sys/range_tree.h> #ifdef _KERNEL #include <sys/zfs_vfsops.h> #endif +/* + * Grand theory statement on scan queue sorting + * + * Scanning is implemented by recursively traversing all indirection levels + * in an object and reading all blocks referenced from said objects. This + * results in us approximately traversing the object from lowest logical + * offset to the highest. For best performance, we would want the logical + * blocks to be physically contiguous. However, this is frequently not the + * case with pools given the allocation patterns of copy-on-write filesystems. + * So instead, we put the I/Os into a reordering queue and issue them in a + * way that will most benefit physical disks (LBA-order). + * + * Queue management: + * + * Ideally, we would want to scan all metadata and queue up all block I/O + * prior to starting to issue it, because that allows us to do an optimal + * sorting job. This can however consume large amounts of memory. Therefore + * we continuously monitor the size of the queues and constrain them to 5% + * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this + * limit, we clear out a few of the largest extents at the head of the queues + * to make room for more scanning. Hopefully, these extents will be fairly + * large and contiguous, allowing us to approach sequential I/O throughput + * even without a fully sorted tree. + * + * Metadata scanning takes place in dsl_scan_visit(), which is called from + * dsl_scan_sync() every spa_sync(). If we have either fully scanned all + * metadata on the pool, or we need to make room in memory because our + * queues are too large, dsl_scan_visit() is postponed and + * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies + * that metadata scanning and queued I/O issuing are mutually exclusive. This + * allows us to provide maximum sequential I/O throughput for the majority of + * I/O's issued since sequential I/O performance is significantly negatively + * impacted if it is interleaved with random I/O. + * + * Implementation Notes + * + * One side effect of the queued scanning algorithm is that the scanning code + * needs to be notified whenever a block is freed. This is needed to allow + * the scanning code to remove these I/Os from the issuing queue. Additionally, + * we do not attempt to queue gang blocks to be issued sequentially since this + * is very hard to do and would have an extremely limited performance benefit. + * Instead, we simply issue gang I/Os as soon as we find them using the legacy + * algorithm. + * + * Backwards compatibility + * + * This new algorithm is backwards compatible with the legacy on-disk data + * structures (and therefore does not require a new feature flag). + * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan + * will stop scanning metadata (in logical order) and wait for all outstanding + * sorted I/O to complete. Once this is done, we write out a checkpoint + * bookmark, indicating that we have scanned everything logically before it. + * If the pool is imported on a machine without the new sorting algorithm, + * the scan simply resumes from the last checkpoint using the legacy algorithm. + */ + typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_phys_t *); static scan_cb_t dsl_scan_scrub_cb; -static void dsl_scan_cancel_sync(void *, dmu_tx_t *); -static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *); -static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *); +static int scan_ds_queue_compare(const void *a, const void *b); +static int scan_prefetch_queue_compare(const void *a, const void *b); +static void scan_ds_queue_clear(dsl_scan_t *scn); +static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, + uint64_t *txg); +static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); +static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); +static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); + +extern int zfs_vdev_async_write_active_min_dirty_percent; + +/* + * By default zfs will check to ensure it is not over the hard memory + * limit before each txg. If finer-grained control of this is needed + * this value can be set to 1 to enable checking before scanning each + * block. + */ +int zfs_scan_strict_mem_lim = B_FALSE; + +/* + * Maximum number of parallelly executing I/Os per top-level vdev. + * Tune with care. Very high settings (hundreds) are known to trigger + * some firmware bugs and resets on certain SSDs. + */ int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ -int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ -int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ -int zfs_scan_idle = 50; /* idle window in clock ticks */ +unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ +unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ +unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */ -int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ -int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ -int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */ -int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ +/* + * Maximum number of parallelly executed bytes per leaf vdev. We attempt + * to strike a balance here between keeping the vdev queues full of I/Os + * at all times and not overflowing the queues to cause long latency, + * which would cause long txg sync times. No matter what, we will not + * overload the drives with I/O, since that is protected by + * zfs_vdev_scrub_max_active. + */ +unsigned long zfs_scan_vdev_limit = 4 << 20; + +int zfs_scan_issue_strategy = 0; +int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */ +uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ + +unsigned int zfs_scan_checkpoint_intval = 7200; /* seconds */ +#define ZFS_SCAN_CHECKPOINT_INTVAL SEC_TO_TICK(zfs_scan_checkpoint_intval) + +/* + * fill_weight is non-tunable at runtime, so we copy it at module init from + * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would + * break queue sorting. + */ +uint64_t zfs_scan_fill_weight = 3; +static uint64_t fill_weight; + +/* See dsl_scan_should_clear() for details on the memory limit tunables */ +uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ +uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ +int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */ +int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */ + +unsigned int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ +unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ +/* min millisecs to obsolete per txg */ +unsigned int zfs_obsolete_min_time_ms = 500; +/* min millisecs to resilver per txg */ +unsigned int zfs_resilver_min_time_ms = 3000; boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; -int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */ /* max number of blocks to free in a single TXG */ uint64_t zfs_async_block_max_blocks = UINT64_MAX; +/* + * We wait a few txgs after importing a pool to begin scanning so that + * the import / mounting code isn't held up by scrub / resilver IO. + * Unfortunately, it is a bit difficult to determine exactly how long + * this will take since userspace will trigger fs mounts asynchronously + * and the kernel will create zvol minors asynchronously. As a result, + * the value provided here is a bit arbitrary, but represents a + * reasonable estimate of how many txgs it will take to finish fully + * importing a pool + */ +#define SCAN_IMPORT_WAIT_TXGS 5 + + #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) @@ -97,6 +220,163 @@ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ }; +/* In core node for the scn->scn_queue. Represents a dataset to be scanned */ +typedef struct { + uint64_t sds_dsobj; + uint64_t sds_txg; + avl_node_t sds_node; +} scan_ds_t; + +/* + * This controls what conditions are placed on dsl_scan_sync_state(): + * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0 + * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0. + * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise + * write out the scn_phys_cached version. + * See dsl_scan_sync_state for details. + */ +typedef enum { + SYNC_OPTIONAL, + SYNC_MANDATORY, + SYNC_CACHED +} state_sync_type_t; + +/* + * This struct represents the minimum information needed to reconstruct a + * zio for sequential scanning. This is useful because many of these will + * accumulate in the sequential IO queues before being issued, so saving + * memory matters here. + */ +typedef struct scan_io { + /* fields from blkptr_t */ + uint64_t sio_offset; + uint64_t sio_blk_prop; + uint64_t sio_phys_birth; + uint64_t sio_birth; + zio_cksum_t sio_cksum; + uint32_t sio_asize; + + /* fields from zio_t */ + int sio_flags; + zbookmark_phys_t sio_zb; + + /* members for queue sorting */ + union { + avl_node_t sio_addr_node; /* link into issueing queue */ + list_node_t sio_list_node; /* link for issuing to disk */ + } sio_nodes; +} scan_io_t; + +struct dsl_scan_io_queue { + dsl_scan_t *q_scn; /* associated dsl_scan_t */ + vdev_t *q_vd; /* top-level vdev that this queue represents */ + + /* trees used for sorting I/Os and extents of I/Os */ + range_tree_t *q_exts_by_addr; + avl_tree_t q_exts_by_size; + avl_tree_t q_sios_by_addr; + + /* members for zio rate limiting */ + uint64_t q_maxinflight_bytes; + uint64_t q_inflight_bytes; + kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */ + + /* per txg statistics */ + uint64_t q_total_seg_size_this_txg; + uint64_t q_segs_this_txg; + uint64_t q_total_zio_size_this_txg; + uint64_t q_zios_this_txg; +}; + +/* private data for dsl_scan_prefetch_cb() */ +typedef struct scan_prefetch_ctx { + zfs_refcount_t spc_refcnt; /* refcount for memory management */ + dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */ + boolean_t spc_root; /* is this prefetch for an objset? */ + uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */ + uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */ +} scan_prefetch_ctx_t; + +/* private data for dsl_scan_prefetch() */ +typedef struct scan_prefetch_issue_ctx { + avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */ + scan_prefetch_ctx_t *spic_spc; /* spc for the callback */ + blkptr_t spic_bp; /* bp to prefetch */ + zbookmark_phys_t spic_zb; /* bookmark to prefetch */ +} scan_prefetch_issue_ctx_t; + +static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, + const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue); +static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, + scan_io_t *sio); + +static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); +static void scan_io_queues_destroy(dsl_scan_t *scn); + +static kmem_cache_t *sio_cache; + +void +scan_init(void) +{ + /* + * This is used in ext_size_compare() to weight segments + * based on how sparse they are. This cannot be changed + * mid-scan and the tree comparison functions don't currently + * have a mechansim for passing additional context to the + * compare functions. Thus we store this value globally and + * we only allow it to be set at module intiailization time + */ + fill_weight = zfs_scan_fill_weight; + + sio_cache = kmem_cache_create("sio_cache", + sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +scan_fini(void) +{ + kmem_cache_destroy(sio_cache); +} + +static inline boolean_t +dsl_scan_is_running(const dsl_scan_t *scn) +{ + return (scn->scn_phys.scn_state == DSS_SCANNING); +} + +boolean_t +dsl_scan_resilvering(dsl_pool_t *dp) +{ + return (dsl_scan_is_running(dp->dp_scan) && + dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); +} + +static inline void +sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id) +{ + bzero(bp, sizeof (*bp)); + DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize); + DVA_SET_VDEV(&bp->blk_dva[0], vdev_id); + DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset); + bp->blk_prop = sio->sio_blk_prop; + bp->blk_phys_birth = sio->sio_phys_birth; + bp->blk_birth = sio->sio_birth; + bp->blk_fill = 1; /* we always only work with data pointers */ + bp->blk_cksum = sio->sio_cksum; +} + +static inline void +bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) +{ + /* we discard the vdev id, since we can deduce it from the queue */ + sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]); + sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]); + sio->sio_blk_prop = bp->blk_prop; + sio->sio_phys_birth = bp->blk_phys_birth; + sio->sio_birth = bp->blk_birth; + sio->sio_cksum = bp->blk_cksum; +} + int dsl_scan_init(dsl_pool_t *dp, uint64_t txg) { @@ -117,6 +397,13 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY); + bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); + avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), + offsetof(scan_ds_t, sds_node)); + avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, + sizeof (scan_prefetch_issue_ctx_t), + offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_func", sizeof (uint64_t), 1, &f); if (err == 0) { @@ -127,7 +414,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) scn->scn_restart_txg = txg; zfs_dbgmsg("old-style scrub was in progress; " "restarting new-style scrub in txg %llu", - scn->scn_restart_txg); + (longlong_t)scn->scn_restart_txg); /* * Load the queue obj from the old location so that it @@ -145,7 +432,14 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) else if (err) return (err); - if (scn->scn_phys.scn_state == DSS_SCANNING && + /* + * We might be restarting after a reboot, so jump the issued + * counter to how far we've scanned. We know we're consistent + * up to here. + */ + scn->scn_issued_before_pass = scn->scn_phys.scn_examined; + + if (dsl_scan_is_running(scn) && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { /* * A new-type scrub was in progress on an old @@ -157,10 +451,26 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) scn->scn_restart_txg = txg; zfs_dbgmsg("new-style scrub was modified " "by old software; restarting in txg %llu", - scn->scn_restart_txg); + (longlong_t)scn->scn_restart_txg); } *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201911211346.xALDkHZ8035760>