From owner-svn-src-vendor@FreeBSD.ORG Thu Sep  5 17:52:57 2013
Message-Id: <201309051752.r85Hqtkq092729@svn.freebsd.org>
From: Xin LI
Date: Thu, 5 Sep 2013 17:52:55 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject: svn commit: r255255 - vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor/illumos/dist/cmd/ztest vendor/illumos/dist/lib/libzpool/common/sys
List-Id: SVN commit messages for the vendor work area tree

Author: delphij
Date: Thu Sep  5 17:52:54 2013
New Revision: 255255
URL: http://svnweb.freebsd.org/changeset/base/255255

Log:
  Update vendor/illumos/dist and vendor-sys/illumos/dist to 14164:dceb17481b99:

  Illumos ZFS issues:

    4045 zfs write throttle & i/o scheduler performance work

Modified:
  vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_tx.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_zfetch.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dir.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_pool.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/arc.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_tx.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_dir.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dsl_pool.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/sa_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/txg.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/txg_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev_impl.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_context.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/txg.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_cache.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_mirror.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_queue.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_raidz.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c

Changes in other areas also in this revision:
Modified:
  vendor/illumos/dist/cmd/ztest/ztest.c
  vendor/illumos/dist/lib/libzpool/common/sys/zfs_context.h

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c	Thu Sep  5 16:38:26 2013	(r255254)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c	Thu Sep  5 17:52:54 2013	(r255255)
@@ -127,6 +127,7 @@
 #include
 #include
 #include
+#include
 #ifdef _KERNEL
 #include
 #include
@@ -147,10 +148,6 @@
 static kmutex_t		arc_reclaim_thr_lock;
 static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
 static uint8_t		arc_thread_exit;
 
-extern int zfs_write_limit_shift;
-extern uint64_t zfs_write_limit_max;
-extern kmutex_t zfs_write_limit_lock;
-
 #define	ARC_REDUCE_DNLC_PERCENT	3
 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
 
@@ -159,6 +156,12 @@ typedef enum arc_reclaim_strategy {
 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
 } arc_reclaim_strategy_t;
 
+/*
+ * The number of iterations through arc_evict_*() before we
+ * drop & reacquire the lock.
+ */
+int arc_evict_iterations = 100;
+
 /* number of seconds before growing cache again */
 static int		arc_grow_retry = 60;
 
@@ -174,6 +177,11 @@ static int		arc_shrink_shift = 5;
  */
 static int		arc_min_prefetch_lifespan;
 
+/*
+ * If this percent of memory is free, don't throttle.
+ */
+int arc_lotsfree_percent = 10;
+
 static int arc_dead;
 
 /*
@@ -469,6 +477,7 @@ typedef struct arc_write_callback arc_wr
 struct arc_write_callback {
 	void		*awcb_private;
 	arc_done_func_t	*awcb_ready;
+	arc_done_func_t	*awcb_physdone;
 	arc_done_func_t	*awcb_done;
 	arc_buf_t	*awcb_buf;
 };
@@ -1163,7 +1172,7 @@ arc_change_state(arc_state_t *new_state,
 	uint64_t from_delta, to_delta;
 
 	ASSERT(MUTEX_HELD(hash_lock));
-	ASSERT(new_state != old_state);
+	ASSERT3P(new_state, !=, old_state);
 	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
 	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
 	ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
@@ -1778,6 +1787,8 @@ arc_evict(arc_state_t *state, uint64_t s
 	kmutex_t *hash_lock;
 	boolean_t have_lock;
 	void *stolen = NULL;
+	arc_buf_hdr_t marker = { 0 };
+	int count = 0;
 
 	ASSERT(state == arc_mru || state == arc_mfu);
 
@@ -1801,6 +1812,33 @@ arc_evict(arc_state_t *state, uint64_t s
 		if (recycle && ab->b_size != bytes &&
 		    ab_prev && ab_prev->b_size == bytes)
 			continue;
+
+		/* ignore markers */
+		if (ab->b_spa == 0)
+			continue;
+
+		/*
+		 * It may take a long time to evict all the bufs requested.
+		 * To avoid blocking all arc activity, periodically drop
+		 * the arcs_mtx and give other threads a chance to run
+		 * before reacquiring the lock.
+		 *
+		 * If we are looking for a buffer to recycle, we are in
+		 * the hot code path, so don't sleep.
+		 */
+		if (!recycle && count++ > arc_evict_iterations) {
+			list_insert_after(list, ab, &marker);
+			mutex_exit(&evicted_state->arcs_mtx);
+			mutex_exit(&state->arcs_mtx);
+			kpreempt(KPREEMPT_SYNC);
+			mutex_enter(&state->arcs_mtx);
+			mutex_enter(&evicted_state->arcs_mtx);
+			ab_prev = list_prev(list, &marker);
+			list_remove(list, &marker);
+			count = 0;
+			continue;
+		}
+
 		hash_lock = HDR_LOCK(ab);
 		have_lock = MUTEX_HELD(hash_lock);
 		if (have_lock || mutex_tryenter(hash_lock)) {
@@ -1882,25 +1920,11 @@ arc_evict(arc_state_t *state, uint64_t s
 	ARCSTAT_INCR(arcstat_mutex_miss, missed);
 
 	/*
-	 * We have just evicted some data into the ghost state, make
-	 * sure we also adjust the ghost state size if necessary.
+	 * Note: we have just evicted some data into the ghost state,
+	 * potentially putting the ghost size over the desired size. Rather
+	 * that evicting from the ghost list in this hot code path, leave
+	 * this chore to the arc_reclaim_thread().
 	 */
-	if (arc_no_grow &&
-	    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
-		int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
-		    arc_mru_ghost->arcs_size - arc_c;
-
-		if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
-			int64_t todelete =
-			    MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
-			arc_evict_ghost(arc_mru_ghost, NULL, todelete);
-		} else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
-			int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
-			    arc_mru_ghost->arcs_size +
-			    arc_mfu_ghost->arcs_size - arc_c);
-			arc_evict_ghost(arc_mfu_ghost, NULL, todelete);
-		}
-	}
 
 	return (stolen);
 }
@@ -1918,12 +1942,15 @@ arc_evict_ghost(arc_state_t *state, uint
 	kmutex_t *hash_lock;
 	uint64_t bytes_deleted = 0;
 	uint64_t bufs_skipped = 0;
+	int count = 0;
 
 	ASSERT(GHOST_STATE(state));
 top:
 	mutex_enter(&state->arcs_mtx);
 	for (ab = list_tail(list); ab; ab = ab_prev) {
 		ab_prev = list_prev(list, ab);
+		if (ab->b_type > ARC_BUFC_NUMTYPES)
+			panic("invalid ab=%p", (void *)ab);
 		if (spa && ab->b_spa != spa)
 			continue;
 
@@ -1935,6 +1962,23 @@ top:
 		/* caller may be trying to modify this buffer, skip it */
 		if (MUTEX_HELD(hash_lock))
 			continue;
+
+		/*
+		 * It may take a long time to evict all the bufs requested.
+		 * To avoid blocking all arc activity, periodically drop
+		 * the arcs_mtx and give other threads a chance to run
+		 * before reacquiring the lock.
+		 */
+		if (count++ > arc_evict_iterations) {
+			list_insert_after(list, ab, &marker);
+			mutex_exit(&state->arcs_mtx);
+			kpreempt(KPREEMPT_SYNC);
+			mutex_enter(&state->arcs_mtx);
+			ab_prev = list_prev(list, &marker);
+			list_remove(list, &marker);
+			count = 0;
+			continue;
+		}
 		if (mutex_tryenter(hash_lock)) {
 			ASSERT(!HDR_IO_IN_PROGRESS(ab));
 			ASSERT(ab->b_buf == NULL);
@@ -1970,8 +2014,10 @@ top:
 			mutex_enter(&state->arcs_mtx);
 			ab_prev = list_prev(list, &marker);
 			list_remove(list, &marker);
-		} else
+		} else {
 			bufs_skipped += 1;
+		}
+
 	}
 	mutex_exit(&state->arcs_mtx);
 
@@ -2825,7 +2871,7 @@ arc_read_done(zio_t *zio)
  */
 int
 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
-    void *private, int priority, int zio_flags, uint32_t *arc_flags,
+    void *private, zio_priority_t priority, int zio_flags, uint32_t *arc_flags,
     const zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr;
@@ -3428,6 +3474,18 @@ arc_write_ready(zio_t *zio)
 	hdr->b_flags |= ARC_IO_IN_PROGRESS;
 }
 
+/*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write. See the comment in dbuf_write_physdone() for details.
+ */
+static void
+arc_write_physdone(zio_t *zio)
+{
+	arc_write_callback_t *cb = zio->io_private;
+	if (cb->awcb_physdone != NULL)
+		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
+}
+
 static void
 arc_write_done(zio_t *zio)
 {
@@ -3508,8 +3566,9 @@ arc_write_done(zio_t *zio)
 zio_t *
 arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
-    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
-    void *private, int priority, int zio_flags, const zbookmark_t *zb)
+    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
+    arc_done_func_t *done, void *private, zio_priority_t priority,
+    int zio_flags, const zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
@@ -3526,18 +3585,20 @@ arc_write(zio_t *pio, spa_t *spa, uint64
 		hdr->b_flags |= ARC_L2COMPRESS;
 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
 	callback->awcb_ready = ready;
+	callback->awcb_physdone = physdone;
 	callback->awcb_done = done;
 	callback->awcb_private = private;
 	callback->awcb_buf = buf;
 
 	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
-	    arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
+	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
+	    priority, zio_flags, zb);
 
 	return (zio);
 }
 
 static int
-arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t txg)
 {
 #ifdef _KERNEL
 	uint64_t available_memory = ptob(freemem);
@@ -3548,7 +3609,8 @@ arc_memory_throttle(uint64_t reserve, ui
 	available_memory =
 	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
 #endif
-	if (available_memory >= zfs_write_limit_max)
+
+	if (freemem > physmem * arc_lotsfree_percent / 100)
 		return (0);
 
 	if (txg > last_txg) {
@@ -3572,20 +3634,6 @@ arc_memory_throttle(uint64_t reserve, ui
 			return (SET_ERROR(EAGAIN));
 		}
 		page_load = 0;
-
-		if (arc_size > arc_c_min) {
-			uint64_t evictable_memory =
-			    arc_mru->arcs_lsize[ARC_BUFC_DATA] +
-			    arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
-			    arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
-			    arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
-			available_memory += MIN(evictable_memory, arc_size - arc_c_min);
-		}
-
-		if (inflight_data > available_memory / 4) {
-			ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
-			return (SET_ERROR(ERESTART));
-		}
#endif
 	return (0);
 }
@@ -3603,15 +3651,6 @@ arc_tempreserve_space(uint64_t reserve,
 	int error;
 	uint64_t anon_size;
 
-#ifdef ZFS_DEBUG
-	/*
-	 * Once in a while, fail for no reason. Everything should cope.
-	 */
-	if (spa_get_random(10000) == 0) {
-		dprintf("forcing random failure\n");
-		return (SET_ERROR(ERESTART));
-	}
-#endif
 	if (reserve > arc_c/4 && !arc_no_grow)
 		arc_c = MIN(arc_c_max, reserve * 4);
 	if (reserve > arc_c)
@@ -3629,7 +3668,8 @@ arc_tempreserve_space(uint64_t reserve,
 	 * in order to compress/encrypt/etc the data. We therefore need to
 	 * make sure that there is sufficient available memory for this.
 	 */
-	if (error = arc_memory_throttle(reserve, anon_size, txg))
+	error = arc_memory_throttle(reserve, txg);
+	if (error != 0)
 		return (error);
 
 	/*
@@ -3778,11 +3818,20 @@ arc_init(void)
 	arc_dead = FALSE;
 	arc_warm = B_FALSE;
 
-	if (zfs_write_limit_max == 0)
-		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
-	else
-		zfs_write_limit_shift = 0;
-	mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
+	/*
+	 * Calculate maximum amount of dirty data per pool.
+	 *
+	 * If it has been set by /etc/system, take that.
+	 * Otherwise, use a percentage of physical memory defined by
+	 * zfs_dirty_data_max_percent (default 10%) with a cap at
+	 * zfs_dirty_data_max_max (default 4GB).
+	 */
+	if (zfs_dirty_data_max == 0) {
+		zfs_dirty_data_max = physmem * PAGESIZE *
+		    zfs_dirty_data_max_percent / 100;
+		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+		    zfs_dirty_data_max_max);
+	}
 }
 
 void
@@ -3823,8 +3872,6 @@ arc_fini(void)
 	mutex_destroy(&arc_mfu_ghost->arcs_mtx);
 	mutex_destroy(&arc_l2c_only->arcs_mtx);
 
-	mutex_destroy(&zfs_write_limit_lock);
-
 	buf_fini();
 
 	ASSERT(arc_loaned_bytes == 0);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c	Thu Sep  5 16:38:26 2013	(r255254)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c	Thu Sep  5 17:52:54 2013	(r255255)
@@ -842,7 +842,7 @@ dbuf_free_range(dnode_t *dn, uint64_t st
 		atomic_inc_64(&zfs_free_range_recv_miss);
 	}
 
-	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
+	for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
 		db_next = list_next(&dn->dn_dbufs, db);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
@@ -1188,6 +1188,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
+	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
+		dr->dr_accounted = db->db.db_size;
 	dr->dr_dbuf = db;
 	dr->dr_txg = tx->tx_txg;
 	dr->dr_next = *drp;
@@ -1271,7 +1273,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t
 			dbuf_rele(parent, FTAG);
 
 		mutex_enter(&db->db_mtx);
-		/* possible race with dbuf_undirty() */
+		/*
+		 * Since we've dropped the mutex, it's possible that
+		 * dbuf_undirty() might have changed this out from under us.
+		 */
 		if (db->db_last_dirty == dr ||
 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
 			mutex_enter(&di->dt.di.dr_mtx);
@@ -1341,7 +1346,11 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_
 
 	ASSERT(db->db.db_size != 0);
 
-	/* XXX would be nice to fix up dn_towrite_space[] */
+	/*
+	 * Any space we accounted for in dp_dirty_* will be cleaned up by
+	 * dsl_pool_sync(). This is relatively rare so the discrepancy
+	 * is not a big deal.
+	 */
 
 	*drp = dr->dr_next;
 
@@ -1521,7 +1530,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, a
 
 /*
  * "Clear" the contents of this dbuf. This will mark the dbuf
- * EVICTING and clear *most* of its references. Unfortunetely,
+ * EVICTING and clear *most* of its references. Unfortunately,
  * when we are not holding the dn_dbufs_mtx, we can't clear the
  * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
 * in this case. For callers from the DMU we will usually see:
@@ -1708,7 +1717,7 @@ dbuf_create(dnode_t *dn, uint8_t level,
 		db->db.db_offset = 0;
 	} else {
 		int blocksize =
-		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
+		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
 		db->db.db_size = blocksize;
 		db->db.db_offset = db->db_blkid * blocksize;
 	}
@@ -1817,7 +1826,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
 }
 
 void
-dbuf_prefetch(dnode_t *dn, uint64_t blkid)
+dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
 {
 	dmu_buf_impl_t *db = NULL;
 	blkptr_t *bp = NULL;
@@ -1841,8 +1850,6 @@ dbuf_prefetch(dnode_t *dn, uint64_t blki
 
 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
 		if (bp && !BP_IS_HOLE(bp)) {
-			int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
-			    ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
 			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
 			zbookmark_t zb;
 
@@ -1851,7 +1858,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blki
 			    dn->dn_object, 0, blkid);
 
 			(void) arc_read(NULL, dn->dn_objset->os_spa,
-			    bp, NULL, NULL, priority,
+			    bp, NULL, NULL, prio,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 			    &aflags, &zb);
 		}
@@ -2532,6 +2539,38 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *
 	mutex_exit(&db->db_mtx);
 }
 
+/*
+ * The SPA will call this callback several times for each zio - once
+ * for every physical child i/o (zio->io_phys_children times). This
+ * allows the DMU to monitor the progress of each logical i/o. For example,
+ * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
+ * block. There may be a long delay before all copies/fragments are completed,
+ * so this callback allows us to retire dirty space gradually, as the physical
+ * i/os complete.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+	dmu_buf_impl_t *db = arg;
+	objset_t *os = db->db_objset;
+	dsl_pool_t *dp = dmu_objset_pool(os);
+	dbuf_dirty_record_t *dr;
+	int delta = 0;
+
+	dr = db->db_data_pending;
+	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+	/*
+	 * The callback will be called io_phys_children times. Retire one
+	 * portion of our dirty space each time we are called. Any rounding
+	 * error will be cleaned up by dsl_pool_sync()'s call to
+	 * dsl_pool_undirty_space().
+	 */
+	delta = dr->dr_accounted / zio->io_phys_children;
+	dsl_pool_undirty_space(dp, delta, zio->io_txg);
+}
+
 /* ARGSUSED */
 static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
@@ -2626,6 +2665,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *b
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 	db->db_data_pending = NULL;
+
 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 }
 
@@ -2744,8 +2784,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
 		ASSERT(db->db_state != DB_NOFILL);
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
 		    db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
-		    dbuf_write_override_ready, dbuf_write_override_done, dr,
-		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
+		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 		mutex_enter(&db->db_mtx);
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
@@ -2756,7 +2796,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
 		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
 		    db->db_blkptr, NULL, db->db.db_size, &zp,
-		    dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
+		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE,
 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
 	} else {
@@ -2764,7 +2804,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
 		dr->dr_zio = arc_write(zio, os->os_spa, txg,
 		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
 		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
-		    dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
-		    ZIO_FLAG_MUSTSUCCEED, &zb);
+		    dbuf_write_physdone, dbuf_write_done, db,
+		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 	}
 }

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c	Thu Sep  5 16:38:26 2013	(r255254)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c	Thu Sep  5 17:52:54 2013	(r255255)
@@ -371,13 +371,11 @@ static int
 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
     int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 {
-	dsl_pool_t *dp = NULL;
 	dmu_buf_t **dbp;
 	uint64_t blkid, nblks, i;
 	uint32_t dbuf_flags;
 	int err;
 	zio_t *zio;
-	hrtime_t start;
 
 	ASSERT(length <= DMU_MAX_ACCESS);
 
@@ -405,9 +403,6 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn,
 	}
 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 
-	if (dn->dn_objset->os_dsl_dataset)
-		dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
-	start = gethrtime();
 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, offset);
 	for (i = 0; i < nblks; i++) {
@@ -428,9 +423,6 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn,
 	/* wait for async i/o */
 	err = zio_wait(zio);
-	/* track read overhead when we are in sync context */
-	if (dp && dsl_pool_sync_context(dp))
-		dp->dp_read_overhead += gethrtime() - start;
 	if (err) {
 		dmu_buf_rele_array(dbp, nblks, tag);
 		return (err);
@@ -512,12 +504,22 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake,
 	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
 }
 
+/*
+ * Issue prefetch i/os for the given blocks.
+ *
+ * Note: The assumption is that we *know* these blocks will be needed
+ * almost immediately. Therefore, the prefetch i/os will be issued at
+ * ZIO_PRIORITY_SYNC_READ
+ *
+ * Note: indirect blocks and other metadata will be read synchronously,
+ * causing this function to block if they are not already cached.
+ */
 void
 dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
 {
 	dnode_t *dn;
 	uint64_t blkid;
-	int nblks, i, err;
+	int nblks, err;
 
 	if (zfs_prefetch_disable)
 		return;
@@ -530,7 +532,7 @@ dmu_prefetch(objset_t *os, uint64_t obje
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
-		dbuf_prefetch(dn, blkid);
+		dbuf_prefetch(dn, blkid, ZIO_PRIORITY_SYNC_READ);
 		rw_exit(&dn->dn_struct_rwlock);
 		return;
 	}
@@ -547,16 +549,16 @@ dmu_prefetch(objset_t *os, uint64_t obje
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift) {
 		int blkshift = dn->dn_datablkshift;
-		nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
-		    P2ALIGN(offset, 1<<blkshift)) >> blkshift;
+		nblks = (P2ROUNDUP(offset + len, 1 << blkshift) -
+		    P2ALIGN(offset, 1 << blkshift)) >> blkshift;
 	} else {
 		nblks = (offset < dn->dn_datablksz);
 	}
 
 	if (nblks != 0) {
 		blkid = dbuf_whichblock(dn, offset);
-		for (i = 0; i < nblks; i++)
-			dbuf_prefetch(dn, blkid+i);
+		for (int i = 0; i < nblks; i++)
+			dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_SYNC_READ);
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
 
@@ -1356,7 +1358,7 @@ dmu_sync_late_arrival(zio_t *pio, objset
 
 	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
 	    zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
-	    dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
+	    dmu_sync_late_arrival_ready, NULL, dmu_sync_late_arrival_done, dsa,
 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
 
 	return (0);
@@ -1496,8 +1498,9 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s
 
 	zio_nowait(arc_write(pio, os->os_spa, txg,
 	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
-	    DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, dmu_sync_done,
-	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+	    DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready,
+	    NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
+	    ZIO_FLAG_CANFAIL, &zb));
 
 	return (0);
 }

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c	Thu Sep  5 16:38:26 2013	(r255254)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c	Thu Sep  5 17:52:54 2013	(r255255)
@@ -1028,7 +1028,7 @@ dmu_objset_sync(objset_t *os, zio_t *pio
 	zio = arc_write(pio, os->os_spa, tx->tx_txg,
 	    os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
 	    DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready,
-	    dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
+	    NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
 	    ZIO_FLAG_MUSTSUCCEED, &zb);
 
 	/*

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_tx.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_tx.c	Thu Sep  5 16:38:26 2013	(r255254)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_tx.c	Thu Sep  5 17:52:54 2013	(r255255)
@@ -54,6 +54,7 @@ dmu_tx_create_dd(dsl_dir_t *dd)
 	    offsetof(dmu_tx_hold_t, txh_node));
 	list_create(&tx->tx_callbacks,
 	    sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node));
+	tx->tx_start = gethrtime();
 #ifdef ZFS_DEBUG
 	refcount_create(&tx->tx_space_written);
 	refcount_create(&tx->tx_space_freed);
@@ -597,13 +598,13 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t
 	if (txh == NULL)
 		return;
 	dn = txh->txh_dnode;
+	dmu_tx_count_dnode(txh);
 
 	if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
 		return;
 	if (len == DMU_OBJECT_END)
 		len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
 
-	dmu_tx_count_dnode(txh);
 
 	/*
 	 * For i/o error checking, we read the first and last level-0
@@ -911,6 +912,156 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_i
 }
 #endif
 
+/*
+ * If we can't do 10 iops, something is wrong. Let us go ahead
+ * and hit zfs_dirty_data_max.
+ */
+hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
+int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
+
+/*
+ * We delay transactions when we've determined that the backend storage
+ * isn't able to accommodate the rate of incoming writes.
+ *
+ * If there is already a transaction waiting, we delay relative to when
+ * that transaction finishes waiting. This way the calculated min_time
+ * is independent of the number of threads concurrently executing
+ * transactions.
+ *
+ * If we are the only waiter, wait relative to when the transaction
+ * started, rather than the current time. This credits the transaction for
+ * "time already served", e.g. reading indirect blocks.
+ *
+ * The minimum time for a transaction to take is calculated as:
+ *     min_time = scale * (dirty - min) / (max - dirty)
+ *     min_time is then capped at zfs_delay_max_ns.
+ *
+ * The delay has two degrees of freedom that can be adjusted via tunables.
+ * The percentage of dirty data at which we start to delay is defined by
+ * zfs_delay_min_dirty_percent. This should typically be at or above
+ * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
+ * delay after writing at full speed has failed to keep up with the incoming
+ * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
+ * speaking, this variable determines the amount of delay at the midpoint of
+ * the curve.
+ *
+ * delay
+ *  10ms +-------------------------------------------------------------*+
+ *       |                                                             *|
+ *   9ms +                                                             *+
+ *       |                                                             *|
+ *   8ms +                                                             *+
+ *       |                                                            * |
+ *   7ms +                                                            * +
+ *       |                                                            * |
+ *   6ms +                                                            * +
+ *       |                                                            * |
+ *   5ms +                                                            * +
+ *       |                                                            * |
+ *   4ms +                                                            * +
+ *       |                                                            * |
+ *   3ms +                                                            * +
+ *       |                                                            * |
+ *   2ms +                                                (midpoint)  *  +
+ *       |                                                      |    **  |
+ *   1ms +                                                      v ***    +
+ *       |             zfs_delay_scale ---------->          ********     |
+ *     0 +-------------------------------------*********----------------+
+ *       0%                    <- zfs_dirty_data_max ->               100%
+ *
+ * Note that since the delay is added to the outstanding time remaining on the
+ * most recent transaction, the delay is effectively the inverse of IOPS.
+ * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
+ * was chosen such that small changes in the amount of accumulated dirty data
+ * in the first 3/4 of the curve yield relatively small differences in the
+ * amount of delay.
+ *
+ * The effects can be easier to understand when the amount of delay is
+ * represented on a log scale:
+ *
+ * delay
+ * 100ms +-------------------------------------------------------------++
+ *       +                                                              +
+ *       |                                                              |
+ *       +                                                             *+
+ *  10ms +                                                             *+
+ *       +                                                            ** +
+ *       |                                                (midpoint)  ** |
+ *       +                                                      |    **  +
+ *   1ms +                                                      v ****   +
+ *       +             zfs_delay_scale ---------->        *****          +
+ *       |                                             ****              |
+ *       +                                          ****                 +
+ * 100us +                                        **                     +
+ *       +                                      *                        +
+ *       |                                     *                         |
+ *       +                                    *                          +
+ *  10us +                                   *                           +
+ *       +                                                               +
+ *       |                                                               |
+ *       +                                                               +
+ *       +--------------------------------------------------------------+
+ *       0%                    <- zfs_dirty_data_max ->               100%
+ *
+ * Note here that only as the amount of dirty data approaches its limit does
+ * the delay start to increase rapidly. The goal of a properly tuned system
+ * should be to keep the amount of dirty data out of that range by first
+ * ensuring that the appropriate limits are set for the I/O scheduler to reach
+ * optimal throughput on the backend storage, and then by changing the value
+ * of zfs_delay_scale to increase the steepness of the curve.
+ */
+static void
+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+{
+	dsl_pool_t *dp = tx->tx_pool;
+	uint64_t delay_min_bytes =
+	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+	hrtime_t wakeup, min_tx_time, now;
+
+	if (dirty <= delay_min_bytes)
+		return;
+
+	/*
+	 * The caller has already waited until we are under the max.
+	 * We make them pass us the amount of dirty data so we don't
+	 * have to handle the case of it being >= the max, which could
+	 * cause a divide-by-zero if it's == the max.
+	 */
+	ASSERT3U(dirty, <, zfs_dirty_data_max);
+
+	now = gethrtime();
+	min_tx_time = zfs_delay_scale *
+	    (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+	if (now > tx->tx_start + min_tx_time)
+		return;
+
+	min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
+
+	DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
+	    uint64_t, min_tx_time);
+
+	mutex_enter(&dp->dp_lock);
+	wakeup = MAX(tx->tx_start + min_tx_time,
+	    dp->dp_last_wakeup + min_tx_time);
+	dp->dp_last_wakeup = wakeup;
+	mutex_exit(&dp->dp_lock);
+
+#ifdef _KERNEL
+	mutex_enter(&curthread->t_delay_lock);
+	while (cv_timedwait_hires(&curthread->t_delay_cv,
+	    &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
+	    CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
+		continue;
+	mutex_exit(&curthread->t_delay_lock);
+#else
+	hrtime_t delta = wakeup - gethrtime();
+	struct timespec ts;
+	ts.tv_sec = delta / NANOSEC;
+	ts.tv_nsec = delta % NANOSEC;
+	(void) nanosleep(&ts, NULL);
+#endif
+}
+
 static int
 dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 {
@@ -941,6 +1092,12 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_
 		return (SET_ERROR(ERESTART));
 	}
 
+	if (!tx->tx_waited &&
+	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
+		tx->tx_wait_dirty = B_TRUE;
+		return (SET_ERROR(ERESTART));
+	}
+
 	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
 	tx->tx_needassign_txh = NULL;
 
@@ -1065,6 +1222,10 @@ dmu_tx_unassign(dmu_tx_t *tx)
  *	blocking, returns immediately with ERESTART. This should be used
 *	whenever you're holding locks. On an ERESTART error, the caller
 *	should drop locks, do a dmu_tx_wait(tx), and try again.
+ *
+ * (3)	TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait()
+ *	has already been called on behalf of this operation (though
+ *	most likely on a different tx).
 */
 int
 dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
@@ -1072,12 +1233,16 @@ dmu_tx_assign(dmu_tx_t *tx, txg_how_t tx
 	int err;
 
 	ASSERT(tx->tx_txg == 0);
-	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT);
+	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
+	    txg_how == TXG_WAITED);
 	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
 
 	/* If we might wait, we must not hold the config lock. */
 	ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
 
+	if (txg_how == TXG_WAITED)
+		tx->tx_waited = B_TRUE;
+
 	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
 		dmu_tx_unassign(tx);
 
@@ -1096,18 +1261,48 @@
 void
 dmu_tx_wait(dmu_tx_t *tx)
 {
 	spa_t *spa = tx->tx_pool->dp_spa;
+	dsl_pool_t *dp = tx->tx_pool;
 
 	ASSERT(tx->tx_txg == 0);
 	ASSERT(!dsl_pool_config_held(tx->tx_pool));
 
-	/*
-	 * It's possible that the pool has become active after this thread
-	 * has tried to obtain a tx. If that's the case then his
-	 * tx_lasttried_txg would not have been assigned.
-	 */
-	if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
-		txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
+	if (tx->tx_wait_dirty) {
+		/*
+		 * dmu_tx_try_assign() has determined that we need to wait
+		 * because we've consumed much or all of the dirty buffer
+		 * space.
+		 */
+		mutex_enter(&dp->dp_lock);
+		while (dp->dp_dirty_total >= zfs_dirty_data_max)
+			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
+		uint64_t dirty = dp->dp_dirty_total;
+		mutex_exit(&dp->dp_lock);
+
+		dmu_tx_delay(tx, dirty);
+
+		tx->tx_wait_dirty = B_FALSE;
+
+		/*
+		 * Note: setting tx_waited only has effect if the caller
+		 * used TX_WAIT. Otherwise they are going to destroy
+		 * this tx and try again. The common case, zfs_write(),
+		 * uses TX_WAIT.
+		 */
+		tx->tx_waited = B_TRUE;
+	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
+		/*
+		 * If the pool is suspended we need to wait until it
+		 * is resumed. Note that it's possible that the pool
+		 * has become active after this thread has tried to
+		 * obtain a tx. If that's the case then tx_lasttried_txg
+		 * would not have been set.
+		 */
+		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
 	} else if (tx->tx_needassign_txh) {
+		/*
+		 * A dnode is assigned to the quiescing txg. Wait for its
+		 * transaction to complete.
+		 */
 		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
 
 		mutex_enter(&dn->dn_mtx);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_zfetch.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_zfetch.c	Thu Sep  5 16:38:26 2013	(r255254)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_zfetch.c	Thu Sep  5 17:52:54 2013	(r255255)
@@ -23,6 +23,10 @@
 * Use is subject to license terms.
 */
 
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
 #include
 #include
 #include
@@ -287,7 +291,7 @@ dmu_zfetch_fetch(dnode_t *dn, uint64_t b
 	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
 
 	for (i = 0; i < fetchsz; i++) {
-		dbuf_prefetch(dn, blkid + i);
+		dbuf_prefetch(dn, blkid + i, ZIO_PRIORITY_ASYNC_READ);
 	}
 
 	return (fetchsz);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c	Thu Sep  5 16:38:26 2013	(r255254)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c	Thu Sep  5 17:52:54 2013	(r255255)
@@ -1788,23 +1788,22 @@ dnode_diduse_space(dnode_t *dn, int64_t
 }
 
 /*
- * Call when we think we're going to write/free space in open context.
- * Be conservative (ie. OK to write less than this or free more than
- * this, but don't write more or free less).
+ * Call when we think we're going to write/free space in open context to track
+ * the amount of memory in use by the currently open txg.
 */
 void
 dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
 {
 	objset_t *os = dn->dn_objset;
 	dsl_dataset_t *ds = os->os_dsl_dataset;
+	int64_t aspace = spa_get_asize(os->os_spa, space);
 
-	if (space > 0)
-		space = spa_get_asize(os->os_spa, space);
-
-	if (ds)
-		dsl_dir_willuse_space(ds->ds_dir, space, tx);
+	if (ds != NULL) {
+		dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
+		dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
+	}
 
-	dmu_tx_willuse_space(tx, space);
+	dmu_tx_willuse_space(tx, aspace);
 }
 
 /*

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dir.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dir.c	Thu Sep  5 16:38:26 2013	(r255254)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dir.c	Thu Sep  5 17:52:54 2013	(r255255)
@@ -584,7 +584,6 @@ dsl_dir_space_available(dsl_dir_t *dd,
 
 struct tempreserve {
 	list_node_t tr_node;
-	dsl_pool_t *tr_dp;
 	dsl_dir_t *tr_ds;
 	uint64_t tr_size;
 };
@@ -735,25 +734,24 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd,
 		tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
 		tr->tr_size = lsize;
 		list_insert_tail(tr_list, tr);
-
-		err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
 	} else {
 		if (err == EAGAIN) {
+			/*
+			 * If arc_memory_throttle() detected that pageout
+			 * is running and we are low on memory, we delay new
+			 * non-pageout transactions to give pageout an
+			 * advantage.
+			 *
+			 * It is unfortunate to be delaying while the caller's
+			 * locks are held.
+			 */
 			txg_delay(dd->dd_pool, tx->tx_txg,
 			    MSEC2NSEC(10), MSEC2NSEC(10));
 			err = SET_ERROR(ERESTART);
 		}
-		dsl_pool_memory_pressure(dd->dd_pool);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
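
As a rough, standalone illustration of the delay curve documented in the dmu_tx.c comment above (min_time = zfs_delay_scale * (dirty - min) / (max - dirty), capped at zfs_delay_max_ns), the sketch below tabulates the computed delay for several dirty-data levels. It is not part of the commit, and the tunable values it uses are assumptions chosen for illustration only: a 4GB zfs_dirty_data_max, a 60% zfs_delay_min_dirty_percent, and a zfs_delay_scale of 500000ns (consistent with the comment's note that the midpoint corresponds to about 500us, i.e. 2000 IOPS). The real defaults live in dsl_pool.c, which falls outside the truncated diff.

/*
 * Standalone sketch (not from this commit): evaluate the write-throttle
 * delay curve described in the dmu_tx.c comment above.  All tunable
 * values below are illustrative assumptions, not the upstream defaults.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t zfs_dirty_data_max = 4ULL << 30;	/* assumed: 4GB cap */
	uint64_t zfs_delay_min_dirty_percent = 60;	/* assumed: delay above 60% dirty */
	uint64_t zfs_delay_scale = 500 * 1000;		/* assumed: 500us at the midpoint */
	uint64_t zfs_delay_max_ns = 100 * 1000 * 1000;	/* 100ms cap, as in MSEC2NSEC(100) */
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;

	for (int pct = 55; pct <= 95; pct += 5) {
		uint64_t dirty = zfs_dirty_data_max * pct / 100;
		uint64_t min_tx_time = 0;

		if (dirty > delay_min_bytes) {
			/* min_time = scale * (dirty - min) / (max - dirty) */
			min_tx_time = zfs_delay_scale *
			    (dirty - delay_min_bytes) /
			    (zfs_dirty_data_max - dirty);
			if (min_tx_time > zfs_delay_max_ns)
				min_tx_time = zfs_delay_max_ns;
		}
		printf("dirty = %3d%%  min_tx_time = %9llu ns\n",
		    pct, (unsigned long long)min_tx_time);
	}
	return (0);
}

With these assumed values, 80% dirty (halfway between the 60% threshold and the limit) yields 500000ns, matching the comment's 2000 IOPS midpoint, while 95% dirty yields 3.5ms, showing how the delay ramps up sharply only as dirty data approaches zfs_dirty_data_max.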