Date: Tue, 10 Dec 2013 13:33:56 +0000 (UTC) From: Andriy Gapon <avg@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org Subject: svn commit: r259170 - vendor-sys/illumos/dist/common/zfs vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor/illumos/dist/cmd/zdb vendor/illumos/dist/cmd/... Message-ID: <201312101333.rBADXuI5052602@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: avg Date: Tue Dec 10 13:33:56 2013 New Revision: 259170 URL: http://svnweb.freebsd.org/changeset/base/259170 Log: 4370 avoid transmitting holes during zfs send 4371 DMU code clean up illumos/illumos-gate@43466aae47bfcd2ad9bf501faec8e75c08095e4f Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.c vendor-sys/illumos/dist/common/zfs/zfeature_common.h vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c vendor-sys/illumos/dist/uts/common/fs/zfs/bptree.c vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_diff.c vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_dataset.c vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_destroy.c vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c vendor-sys/illumos/dist/uts/common/fs/zfs/spa_misc.c vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa_impl.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/vdev.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfeature.h vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_cache.c vendor-sys/illumos/dist/uts/common/fs/zfs/vdev_disk.c vendor-sys/illumos/dist/uts/common/fs/zfs/zfeature.c vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_znode.c vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c vendor-sys/illumos/dist/uts/common/fs/zfs/zvol.c Changes in other areas also in this revision: Modified: vendor/illumos/dist/cmd/zdb/zdb.c vendor/illumos/dist/cmd/zdb/zdb_il.c vendor/illumos/dist/cmd/zhack/zhack.c vendor/illumos/dist/man/man5/zpool-features.5 Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.c ============================================================================== --- vendor-sys/illumos/dist/common/zfs/zfeature_common.c Tue Dec 10 13:14:54 2013 (r259169) +++ vendor-sys/illumos/dist/common/zfs/zfeature_common.c Tue Dec 10 13:33:56 2013 (r259170) @@ -115,10 +115,21 @@ zfeature_lookup_name(const char *name, s return (ENOENT); } +boolean_t +zfeature_depends_on(spa_feature_t fid, spa_feature_t check) { + zfeature_info_t *feature = &spa_feature_table[fid]; + + for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) { + if (feature->fi_depends[i] == check) + return (B_TRUE); + } + return (B_FALSE); +} + static void zfeature_register(spa_feature_t fid, const char *guid, const char *name, const char *desc, boolean_t readonly, boolean_t mos, - const spa_feature_t *deps) + boolean_t activate_on_enable, const spa_feature_t *deps) { zfeature_info_t *feature = &spa_feature_table[fid]; static spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; @@ -138,6 +149,7 @@ zfeature_register(spa_feature_t fid, con feature->fi_desc = desc; feature->fi_can_readonly = readonly; feature->fi_mos = mos; + feature->fi_activate_on_enable = activate_on_enable; feature->fi_depends = deps; } @@ -146,21 +158,43 @@ zpool_feature_init(void) { zfeature_register(SPA_FEATURE_ASYNC_DESTROY, "com.delphix:async_destroy", "async_destroy", - "Destroy filesystems asynchronously.", B_TRUE, B_FALSE, NULL); + "Destroy filesystems asynchronously.", B_TRUE, B_FALSE, + B_FALSE, NULL); + zfeature_register(SPA_FEATURE_EMPTY_BPOBJ, "com.delphix:empty_bpobj", "empty_bpobj", - "Snapshots use less space.", B_TRUE, B_FALSE, NULL); + "Snapshots use less space.", B_TRUE, B_FALSE, + B_FALSE, NULL); + zfeature_register(SPA_FEATURE_LZ4_COMPRESS, "org.illumos:lz4_compress", "lz4_compress", - "LZ4 compression algorithm support.", B_FALSE, B_FALSE, NULL); + "LZ4 compression algorithm support.", B_FALSE, B_FALSE, + B_FALSE, NULL); + zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump", - "Crash dumps to multiple vdev pools.", B_FALSE, B_FALSE, NULL); + "Crash dumps to multiple vdev pools.", B_FALSE, B_FALSE, + B_FALSE, NULL); + zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM, "com.delphix:spacemap_histogram", "spacemap_histogram", - "Spacemaps maintain space histograms.", B_TRUE, B_FALSE, NULL); + "Spacemaps maintain space histograms.", B_TRUE, B_FALSE, + B_FALSE, NULL); + + zfeature_register(SPA_FEATURE_ENABLED_TXG, + "com.delphix:enabled_txg", "enabled_txg", + "Record txg at which a feature is enabled", B_TRUE, B_FALSE, + B_FALSE, NULL); + + static spa_feature_t hole_birth_deps[] = { SPA_FEATURE_ENABLED_TXG, + SPA_FEATURE_NONE }; + zfeature_register(SPA_FEATURE_HOLE_BIRTH, + "com.delphix:hole_birth", "hole_birth", + "Retain hole birth txg for more precise zfs send", + B_FALSE, B_TRUE, B_TRUE, hole_birth_deps); + zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET, "com.delphix:extensible_dataset", "extensible_dataset", "Enhanced dataset functionality, used by other features.", - B_FALSE, B_FALSE, NULL); + B_FALSE, B_FALSE, B_FALSE, NULL); } Modified: vendor-sys/illumos/dist/common/zfs/zfeature_common.h ============================================================================== --- vendor-sys/illumos/dist/common/zfs/zfeature_common.h Tue Dec 10 13:14:54 2013 (r259169) +++ vendor-sys/illumos/dist/common/zfs/zfeature_common.h Tue Dec 10 13:33:56 2013 (r259170) @@ -45,10 +45,14 @@ typedef enum spa_feature { SPA_FEATURE_LZ4_COMPRESS, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, SPA_FEATURE_SPACEMAP_HISTOGRAM, + SPA_FEATURE_ENABLED_TXG, + SPA_FEATURE_HOLE_BIRTH, SPA_FEATURE_EXTENSIBLE_DATASET, SPA_FEATURES } spa_feature_t; +#define SPA_FEATURE_DISABLED (-1ULL) + typedef struct zfeature_info { spa_feature_t fi_feature; const char *fi_uname; /* User-facing feature name */ @@ -56,6 +60,8 @@ typedef struct zfeature_info { const char *fi_desc; /* Feature description */ boolean_t fi_can_readonly; /* Can open pool readonly w/o support? */ boolean_t fi_mos; /* Is the feature necessary to read the MOS? */ + /* Activate this feature at the same time it is enabled */ + boolean_t fi_activate_on_enable; /* array of dependencies, terminated by SPA_FEATURE_NONE */ const spa_feature_t *fi_depends; } zfeature_info_t; @@ -70,6 +76,7 @@ extern boolean_t zfeature_is_valid_guid( extern boolean_t zfeature_is_supported(const char *); extern int zfeature_lookup_name(const char *name, spa_feature_t *res); +extern boolean_t zfeature_depends_on(spa_feature_t fid, spa_feature_t check); extern void zpool_feature_init(void); Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c Tue Dec 10 13:14:54 2013 (r259169) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c Tue Dec 10 13:33:56 2013 (r259170) @@ -727,7 +727,7 @@ buf_hash(uint64_t spa, const dva_t *dva, #define BUF_EMPTY(buf) \ ((buf)->b_dva.dva_word[0] == 0 && \ (buf)->b_dva.dva_word[1] == 0 && \ - (buf)->b_birth == 0) + (buf)->b_cksum0 == 0) #define BUF_EQUAL(spa, dva, birth, buf) \ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ @@ -3509,9 +3509,13 @@ arc_write_done(zio_t *zio) ASSERT(hdr->b_acb == NULL); if (zio->io_error == 0) { - hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); - hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; + if (BP_IS_HOLE(zio->io_bp)) { + buf_discard_identity(hdr); + } else { + hdr->b_dva = *BP_IDENTITY(zio->io_bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); + hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; + } } else { ASSERT(BUF_EMPTY(hdr)); } Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/bptree.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/bptree.c Tue Dec 10 13:14:54 2013 (r259169) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/bptree.c Tue Dec 10 13:33:56 2013 (r259170) @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #include <sys/arc.h> @@ -141,7 +141,7 @@ bptree_visit_cb(spa_t *spa, zilog_t *zil int err; struct bptree_args *ba = arg; - if (bp == NULL) + if (BP_IS_HOLE(bp)) return (0); err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c Tue Dec 10 13:14:54 2013 (r259169) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c Tue Dec 10 13:33:56 2013 (r259170) @@ -455,10 +455,9 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db) mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; - spa_t *spa; + spa_t *spa = db->db_objset->os_spa; mutex_exit(&db->db_mtx); - DB_GET_SPA(&spa, db); abuf = arc_loan_buf(spa, blksz); bcopy(db->db.db_data, abuf->b_data, blksz); } else { @@ -519,7 +518,6 @@ static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { dnode_t *dn; - spa_t *spa; zbookmark_t zb; uint32_t aflags = ARC_NOWAIT; @@ -559,9 +557,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, - db->db.db_size, db, type)); DB_DNODE_EXIT(db); + dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, + db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; *flags |= DB_RF_CACHED; @@ -569,7 +567,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t return; } - spa = dn->dn_objset->os_spa; DB_DNODE_EXIT(db); db->db_state = DB_READ; @@ -586,7 +583,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t dbuf_add_ref(db, NULL); - (void) arc_read(zio, spa, db->db_blkptr, + (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); @@ -598,8 +595,8 @@ int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { int err = 0; - int havepzio = (zio != NULL); - int prefetch; + boolean_t havepzio = (zio != NULL); + boolean_t prefetch; dnode_t *dn; /* @@ -694,11 +691,10 @@ dbuf_noread(dmu_buf_impl_t *db) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa; + spa_t *spa = db->db_objset->os_spa; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); - DB_GET_SPA(&spa, db); dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { @@ -753,9 +749,8 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, ui } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - spa_t *spa; + spa_t *spa = db->db_objset->os_spa; - DB_GET_SPA(&spa, db); dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { @@ -781,12 +776,9 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) ASSERT(db->db_data_pending != dr); /* free this block */ - if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) { - spa_t *spa; + if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) + zio_free(db->db_objset->os_spa, txg, bp); - DB_GET_SPA(&spa, db); - zio_free(spa, txg, bp); - } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; @@ -804,9 +796,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) /* * Evict (if its unreferenced) or clear (if its referenced) any level-0 * data blocks in the free range, so that any future readers will find - * empty blocks. Also, if we happen across any level-1 dbufs in the - * range that have not already been marked dirty, mark them dirty so - * they stay in memory. + * empty blocks. * * This is a no-op if the dataset is in the middle of an incremental * receive; see comment below for details. @@ -816,14 +806,9 @@ dbuf_free_range(dnode_t *dn, uint64_t st { dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; - int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; - uint64_t first_l1 = start >> epbs; - uint64_t last_l1 = end >> epbs; - if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) { + if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) end = dn->dn_maxblkid; - last_l1 = end >> epbs; - } dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); mutex_enter(&dn->dn_dbufs_mtx); @@ -846,23 +831,8 @@ dbuf_free_range(dnode_t *dn, uint64_t st db_next = list_next(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); - if (db->db_level == 1 && - db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { - mutex_enter(&db->db_mtx); - if (db->db_last_dirty && - db->db_last_dirty->dr_txg < txg) { - dbuf_add_ref(db, FTAG); - mutex_exit(&db->db_mtx); - dbuf_will_dirty(db, tx); - dbuf_rele(db, FTAG); - } else { - mutex_exit(&db->db_mtx); - } - } - if (db->db_level != 0) continue; - dprintf_dbuf(db, "found buf %s\n", ""); if (db->db_blkid < start || db->db_blkid > end) continue; @@ -939,24 +909,29 @@ dbuf_block_freeable(dmu_buf_impl_t *db) * We don't need any locking to protect db_blkptr: * If it's syncing, then db_last_dirty will be set * so we'll ignore db_blkptr. + * + * This logic ensures that only block births for + * filled blocks are considered. */ ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_last_dirty) + if (db->db_last_dirty && (db->db_blkptr == NULL || + !BP_IS_HOLE(db->db_blkptr))) { birth_txg = db->db_last_dirty->dr_txg; - else if (db->db_blkptr) + } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { birth_txg = db->db_blkptr->blk_birth; + } /* - * If we don't exist or are in a snapshot, we can't be freed. + * If this block don't exist or is in a snapshot, it can't be freed. * Don't pass the bp to dsl_dataset_block_freeable() since we * are holding the db_mtx lock and might deadlock if we are * prefetching a dedup-ed block. */ - if (birth_txg) + if (birth_txg != 0) return (ds == NULL || dsl_dataset_block_freeable(ds, NULL, birth_txg)); else - return (FALSE); + return (B_FALSE); } void @@ -976,7 +951,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int si ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* - * This call to dbuf_will_dirty() with the dn_struct_rwlock held + * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held * is OK, because there can be no other references to the db * when we are changing its size, so no concurrent DB_FILL can * be happening. @@ -985,7 +960,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int si * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers */ - dbuf_will_dirty(db, tx); + dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); @@ -1015,9 +990,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int si void dbuf_release_bp(dmu_buf_impl_t *db) { - objset_t *os; + objset_t *os = db->db_objset; - DB_GET_OBJSET(&os, db); ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || list_link_active(&os->os_dsl_dataset->ds_synced_link)); @@ -1391,10 +1365,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_ return (B_FALSE); } -#pragma weak dmu_buf_will_dirty = dbuf_will_dirty void -dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ASSERT(tx->tx_txg != 0); @@ -1517,7 +1491,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, a db->db_state = DB_FILL; mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); - dbuf_fill_done(db, tx); + dmu_buf_fill_done(&db->db, tx); } /* @@ -2022,7 +1996,6 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *t * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the * dnode's parent dbuf evicting its dnode handles. */ -#pragma weak dmu_buf_rele = dbuf_rele void dbuf_rele(dmu_buf_impl_t *db, void *tag) { @@ -2030,6 +2003,12 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag) dbuf_rele_and_unlock(db, tag); } +void +dmu_buf_rele(dmu_buf_t *db, void *tag) +{ + dbuf_rele((dmu_buf_impl_t *)db, tag); +} + /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow * db_dirtycnt and db_holds to be updated atomically. @@ -2476,18 +2455,14 @@ dbuf_write_ready(zio_t *zio, arc_buf_t * dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; - if (BP_IS_HOLE(bp)) { - ASSERT(bp->blk_fill == 0); - DB_DNODE_EXIT(db); - return; + if (bp->blk_birth != 0) { + ASSERT((db->db_blkid != DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_type) || + (db->db_blkid == DMU_SPILL_BLKID && + BP_GET_TYPE(bp) == dn->dn_bonustype)); + ASSERT(BP_GET_LEVEL(bp) == db->db_level); } - ASSERT((db->db_blkid != DMU_SPILL_BLKID && - BP_GET_TYPE(bp) == dn->dn_type) || - (db->db_blkid == DMU_SPILL_BLKID && - BP_GET_TYPE(bp) == dn->dn_bonustype)); - ASSERT(BP_GET_LEVEL(bp) == db->db_level); - mutex_enter(&db->db_mtx); #ifdef ZFS_DEBUG @@ -2513,7 +2488,11 @@ dbuf_write_ready(zio_t *zio, arc_buf_t * fill++; } } else { - fill = 1; + if (BP_IS_HOLE(bp)) { + fill = 0; + } else { + fill = 1; + } } } else { blkptr_t *ibp = db->db.db_data; @@ -2568,9 +2547,10 @@ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; - blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; - uint64_t txg = zio->io_txg; + blkptr_t *bp = db->db_blkptr; + objset_t *os = db->db_objset; + dmu_tx_t *tx = os->os_synctx; dbuf_dirty_record_t **drp, *dr; ASSERT0(zio->io_error); @@ -2583,14 +2563,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *b if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { - objset_t *os; - dsl_dataset_t *ds; - dmu_tx_t *tx; - - DB_GET_OBJSET(&os, db); - ds = os->os_dsl_dataset; - tx = os->os_synctx; - + dsl_dataset_t *ds = os->os_dsl_dataset; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } @@ -2603,7 +2576,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *b while ((dr = *drp) != db->db_data_pending) drp = &dr->dr_next; ASSERT(!list_link_active(&dr->dr_dirty_node)); - ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); ASSERT(dr->dr_next == NULL); *drp = dr->dr_next; @@ -2637,14 +2609,14 @@ dbuf_write_done(zio_t *zio, arc_buf_t *b DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); + ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + ASSERT3U(db->db_blkid, <=, + dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); - ASSERT3U(dn->dn_phys->dn_maxblkid - >> (db->db_level * epbs), >=, db->db_blkid); arc_set_callback(db->db_buf, dbuf_do_evict, db); } DB_DNODE_EXIT(db); @@ -2657,8 +2629,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *b ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; - - dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); } static void Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c Tue Dec 10 13:14:54 2013 (r259169) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/ddt.c Tue Dec 10 13:33:56 2013 (r259170) @@ -111,12 +111,12 @@ ddt_object_load(ddt_t *ddt, enum ddt_typ error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); - if (error) + if (error != 0) return (error); - error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class]); + &ddt->ddt_histogram[type][class])); /* * Seed the cached statistics. @@ -127,8 +127,7 @@ ddt_object_load(ddt_t *ddt, enum ddt_typ ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; - ASSERT(error == 0); - return (error); + return (0); } static void @@ -581,7 +580,10 @@ ddt_compress(void *src, uchar_t *dst, si bcopy(src, dst, s_len); } - *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc; + *version = cpfunc; + /* CONSTCOND */ + if (ZFS_HOST_BYTEORDER) + *version |= DDT_COMPRESS_BYTEORDER_MASK; return (c_len + 1); } @@ -598,7 +600,8 @@ ddt_decompress(uchar_t *src, void *dst, else bcopy(src, dst, d_len); - if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK) + if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) != + (ZFS_HOST_BYTEORDER != 0)) byteswap_uint64_array(dst, d_len); } Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c Tue Dec 10 13:14:54 2013 (r259169) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c Tue Dec 10 13:33:56 2013 (r259170) @@ -680,7 +680,7 @@ dmu_free_long_range(objset_t *os, uint64 * will take the fast path, and (b) dnode_reallocate() can verify * that the entire file has been freed. */ - if (offset == 0 && length == DMU_OBJECT_END) + if (err == 0 && offset == 0 && length == DMU_OBJECT_END) dn->dn_maxblkid = 0; dnode_rele(dn, FTAG); @@ -1184,10 +1184,8 @@ arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; - spa_t *spa; - DB_GET_SPA(&spa, db); - return (arc_loan_buf(spa, size)); + return (arc_loan_buf(db->db_objset->os_spa, size)); } /* Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_diff.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_diff.c Tue Dec 10 13:14:54 2013 (r259169) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_diff.c Tue Dec 10 13:33:56 2013 (r259170) @@ -118,7 +118,7 @@ diff_cb(spa_t *spa, zilog_t *zilog, cons if (zb->zb_object != DMU_META_DNODE_OBJECT) return (0); - if (bp == NULL) { + if (BP_IS_HOLE(bp)) { uint64_t span = DBP_SPAN(dnp, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c Tue Dec 10 13:14:54 2013 (r259169) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c Tue Dec 10 13:33:56 2013 (r259170) @@ -356,11 +356,12 @@ backup_cb(spa_t *spa, zilog_t *zilog, co if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); - } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) { + } else if (BP_IS_HOLE(bp) && + zb->zb_object == DMU_META_DNODE_OBJECT) { uint64_t span = BP_SPAN(dnp, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); - } else if (bp == NULL) { + } else if (BP_IS_HOLE(bp)) { uint64_t span = BP_SPAN(dnp, zb->zb_level); err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c Tue Dec 10 13:14:54 2013 (r259169) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c Tue Dec 10 13:33:56 2013 (r259170) @@ -36,6 +36,7 @@ #include <sys/sa.h> #include <sys/sa_impl.h> #include <sys/callb.h> +#include <sys/zfeature.h> int zfs_pd_blks_max = 100; @@ -72,7 +73,7 @@ traverse_zil_block(zilog_t *zilog, blkpt traverse_data_t *td = arg; zbookmark_t zb; - if (bp->blk_birth == 0) + if (BP_IS_HOLE(bp)) return (0); if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) @@ -96,7 +97,7 @@ traverse_zil_record(zilog_t *zilog, lr_t blkptr_t *bp = &lr->lr_blkptr; zbookmark_t zb; - if (bp->blk_birth == 0) + if (BP_IS_HOLE(bp)) return (0); if (claim_txg == 0 || bp->blk_birth < claim_txg) @@ -226,14 +227,35 @@ traverse_visitbp(traverse_data_t *td, co ASSERT(0); } + if (bp->blk_birth == 0) { + if (spa_feature_is_active(td->td_spa, SPA_FEATURE_HOLE_BIRTH)) { + /* + * Since this block has a birth time of 0 it must be a + * hole created before the SPA_FEATURE_HOLE_BIRTH + * feature was enabled. If SPA_FEATURE_HOLE_BIRTH + * was enabled before the min_txg for this traveral we + * know the hole must have been created before the + * min_txg for this traveral, so we can skip it. If + * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg + * for this traveral we cannot tell if the hole was + * created before or after the min_txg for this + * traversal, so we cannot skip it. + */ + uint64_t hole_birth_enabled_txg; + VERIFY(spa_feature_enabled_txg(td->td_spa, + SPA_FEATURE_HOLE_BIRTH, &hole_birth_enabled_txg)); + if (hole_birth_enabled_txg < td->td_min_txg) + return (0); + } + } else if (bp->blk_birth <= td->td_min_txg) { + return (0); + } + if (BP_IS_HOLE(bp)) { - err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg); + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); return (err); } - if (bp->blk_birth <= td->td_min_txg) - return (0); - if (pd && !pd->pd_exited && ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) || BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) { @@ -436,7 +458,8 @@ traverse_prefetcher(spa_t *spa, zilog_t if (pfd->pd_cancel) return (SET_ERROR(EINTR)); - if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || + if (BP_IS_HOLE(bp) || + !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) || BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) return (0); Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c Tue Dec 10 13:14:54 2013 (r259169) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c Tue Dec 10 13:33:56 2013 (r259170) @@ -1544,7 +1544,13 @@ dnode_free_range(dnode_t *dn, uint64_t o } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { - /* Freeing the whole block; fast-track this request */ + /* + * Freeing the whole block; fast-track this request. + * Note that we won't dirty any indirect blocks, + * which is fine because we will be freeing the entire + * file and thus all indirect blocks will be freed + * by free_children(). + */ blkid = 0; nblks = 1; goto done; @@ -1571,7 +1577,7 @@ dnode_free_range(dnode_t *dn, uint64_t o if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); - dbuf_will_dirty(db, tx); + dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); data = db->db.db_data; bzero(data + blkoff, head); @@ -1607,7 +1613,7 @@ dnode_free_range(dnode_t *dn, uint64_t o if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); - dbuf_will_dirty(db, tx); + dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); bzero(db->db.db_data, tail); } @@ -1628,18 +1634,18 @@ dnode_free_range(dnode_t *dn, uint64_t o nblks += 1; /* - * Read in and mark all the level-1 indirects dirty, - * so that they will stay in memory until syncing phase. - * Always dirty the first and last indirect to make sure - * we dirty all the partial indirects. + * Dirty the first and last indirect blocks, as they (and/or their + * parents) will need to be written out if they were only + * partially freed. Interior indirect blocks will be themselves freed, + * by free_children(), so they need not be dirtied. Note that these + * interior blocks have already been prefetched by dmu_tx_hold_free(). */ if (dn->dn_nlevels > 1) { - uint64_t i, first, last; - int shift = epbs + dn->dn_datablkshift; + uint64_t first, last; first = blkid >> epbs; if (db = dbuf_hold_level(dn, 1, first, FTAG)) { - dbuf_will_dirty(db, tx); + dmu_buf_will_dirty(&db->db, tx); dbuf_rele(db, FTAG); } if (trunc) @@ -1647,26 +1653,11 @@ dnode_free_range(dnode_t *dn, uint64_t o else last = (blkid + nblks - 1) >> epbs; if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) { - dbuf_will_dirty(db, tx); + dmu_buf_will_dirty(&db->db, tx); dbuf_rele(db, FTAG); } - for (i = first + 1; i < last; i++) { - uint64_t ibyte = i << shift; - int err; - - err = dnode_next_offset(dn, - DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0); - i = ibyte >> shift; - if (err == ESRCH || i >= last) - break; - ASSERT(err == 0); - db = dbuf_hold_level(dn, 1, i, FTAG); - if (db) { - dbuf_will_dirty(db, tx); - dbuf_rele(db, FTAG); - } - } } + done: /* * Add this range to the dnode range list. @@ -1694,8 +1685,6 @@ done: dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); dnode_setdirty(dn, tx); out: - if (trunc && dn->dn_maxblkid >= (off >> blkshift)) - dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0); rw_exit(&dn->dn_struct_rwlock); } @@ -1872,8 +1861,10 @@ dnode_next_offset_level(dnode_t *dn, int data = db->db.db_data; } - if (db && txg && - (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) { + + if (db != NULL && txg != 0 && (db->db_blkptr == NULL || + db->db_blkptr->blk_birth <= txg || + BP_IS_HOLE(db->db_blkptr))) { /* * This can only happen when we are searching up the tree * and these conditions mean that we need to keep climbing. Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c Tue Dec 10 13:14:54 2013 (r259169) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c Tue Dec 10 13:33:56 2013 (r259170) @@ -32,6 +32,7 @@ #include <sys/dmu_objset.h> #include <sys/dsl_dataset.h> #include <sys/spa.h> +#include <sys/zfeature.h> static void dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) @@ -112,26 +113,44 @@ dnode_increase_indirection(dnode_t *dn, rw_exit(&dn->dn_struct_rwlock); } -static int +static void free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint64_t bytesfreed = 0; - int i, blocks_freed = 0; dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); - for (i = 0; i < num; i++, bp++) { + for (int i = 0; i < num; i++, bp++) { if (BP_IS_HOLE(bp)) continue; bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE); ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); + + /* + * Save some useful information on the holes being + * punched, including logical size, type, and indirection + * level. Retaining birth time enables detection of when + * holes are punched for reducing the number of free + * records transmitted during a zfs send. + */ + + uint64_t lsize = BP_GET_LSIZE(bp); + dmu_object_type_t type = BP_GET_TYPE(bp); + uint64_t lvl = BP_GET_LEVEL(bp); + bzero(bp, sizeof (blkptr_t)); - blocks_freed += 1; + + if (spa_feature_is_active(dn->dn_objset->os_spa, + SPA_FEATURE_HOLE_BIRTH)) { + BP_SET_LSIZE(bp, lsize); + BP_SET_TYPE(bp, type); + BP_SET_LEVEL(bp, lvl); + BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0); + } } dnode_diduse_space(dn, -bytesfreed); - return (blocks_freed); } #ifdef ZFS_DEBUG @@ -215,30 +234,27 @@ free_verify(dmu_buf_impl_t *db, uint64_t #define ALL -1 -static int -free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, +static void +free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) { dnode_t *dn; blkptr_t *bp; dmu_buf_impl_t *subdb; uint64_t start, end, dbstart, dbend, i; - int epbs, shift, err; - int all = TRUE; - int blocks_freed = 0; + int epbs, shift; /* * There is a small possibility that this block will not be cached: * 1 - if level > 1 and there are no children with level <= 1 - * 2 - if we didn't get a dirty hold (because this block had just - * finished being written -- and so had no holds), and then this - * block got evicted before we got here. + * 2 - if this block was evicted since we read it from + * dmu_tx_hold_free(). */ if (db->db_state != DB_CACHED) (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); dbuf_release_bp(db); - bp = (blkptr_t *)db->db.db_data; + bp = db->db.db_data; DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -248,7 +264,6 @@ free_children(dmu_buf_impl_t *db, uint64 start = blkid >> shift; if (dbstart < start) { bp += start - dbstart; - all = FALSE; } else { start = dbstart; } @@ -256,49 +271,46 @@ free_children(dmu_buf_impl_t *db, uint64 end = (blkid + nblks - 1) >> shift; if (dbend <= end) end = dbend; - else if (all) - all = trunc; + ASSERT3U(start, <=, end); if (db->db_level == 1) { FREE_VERIFY(db, start, end, tx); - blocks_freed = free_blocks(dn, bp, end-start+1, tx); - arc_buf_freeze(db->db_buf); - ASSERT(all || blocks_freed == 0 || db->db_last_dirty); - DB_DNODE_EXIT(db); - return (all ? ALL : blocks_freed); + free_blocks(dn, bp, end-start+1, tx); + } else { + for (i = start; i <= end; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, + i, B_TRUE, FTAG, &subdb)); + rw_exit(&dn->dn_struct_rwlock); + ASSERT3P(bp, ==, subdb->db_blkptr); + + free_children(subdb, blkid, nblks, tx); + dbuf_rele(subdb, FTAG); + } } - for (i = start; i <= end; i++, bp++) { - if (BP_IS_HOLE(bp)) - continue; - rw_enter(&dn->dn_struct_rwlock, RW_READER); - err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb); - ASSERT0(err); - rw_exit(&dn->dn_struct_rwlock); - - if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) { - ASSERT3P(subdb->db_blkptr, ==, bp); - blocks_freed += free_blocks(dn, bp, 1, tx); - } else { - all = FALSE; - } - dbuf_rele(subdb, FTAG); + /* If this whole block is free, free ourself too. */ + for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { + if (!BP_IS_HOLE(bp)) + break; } + if (i == 1 << epbs) { + /* didn't find any non-holes */ + bzero(db->db.db_data, db->db.db_size); + free_blocks(dn, db->db_blkptr, 1, tx); + } else { + /* + * Partial block free; must be marked dirty so that it + * will be written out. + */ + ASSERT(db->db_dirtycnt > 0); + } + DB_DNODE_EXIT(db); arc_buf_freeze(db->db_buf); -#ifdef ZFS_DEBUG - bp -= (end-start)+1; - for (i = start; i <= end; i++, bp++) { - if (i == start && blkid != 0) - continue; - else if (i == end && !trunc) - continue; - ASSERT0(bp->blk_birth); - } -#endif - ASSERT(all || blocks_freed == 0 || db->db_last_dirty); *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201312101333.rBADXuI5052602>