Date: Sun, 12 Aug 2018 02:24:19 +0000 (UTC) From: Matt Macy <mmacy@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r337676 - head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs Message-ID: <201808120224.w7C2OJRA028363@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: mmacy Date: Sun Aug 12 02:24:18 2018 New Revision: 337676 URL: https://svnweb.freebsd.org/changeset/base/337676 Log: MFV/ZoL: Fix stack dbuf_hold_impl() commit fc5bb51f08a6c91ff9ad3559d0266eeeab0b1f61 Author: Brian Behlendorf <behlendorf1@llnl.gov> Date: Thu Aug 26 10:52:00 2010 -0700 Fix stack dbuf_hold_impl() This commit preserves the recursive function dbuf_hold_impl() but moves the local variables and function arguments to the heap to minimize the stack frame size. Enough space is initially allocated on the stack for 20 levels of recursion. This technique was based on commit 34229a2f2ac07363f64ddd63e014964fff2f0671 which reduced stack usage of traverse_visitbp(). Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c Sun Aug 12 02:12:44 2018 (r337675) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c Sun Aug 12 02:24:18 2018 (r337676) @@ -51,6 +51,30 @@ #include <sys/cityhash.h> #include <sys/spa_impl.h> +struct dbuf_hold_impl_data { + /* Function arguments */ + dnode_t *dh_dn; + uint8_t dh_level; + uint64_t dh_blkid; + boolean_t dh_fail_sparse; + boolean_t dh_fail_uncached; + void *dh_tag; + dmu_buf_impl_t **dh_dbp; + /* Local variables */ + dmu_buf_impl_t *dh_db; + dmu_buf_impl_t *dh_parent; + blkptr_t *dh_bp; + int dh_err; + dbuf_dirty_record_t *dh_dr; + int dh_depth; +}; + +static void __dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, + dnode_t *dn, uint8_t level, uint64_t blkid, boolean_t fail_sparse, + boolean_t fail_uncached, + void *tag, dmu_buf_impl_t **dbp, int depth); +static int __dbuf_hold_impl(struct dbuf_hold_impl_data *dh); + static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); @@ -2202,9 +2226,10 @@ dbuf_destroy(dmu_buf_impl_t *db) * this happens when the dnode is the meta-dnode, or a userused or groupused * object. */ -static int +__attribute__((always_inline)) +static inline int dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, - dmu_buf_impl_t **parentp, blkptr_t **bpp) + dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh) { *parentp = NULL; *bpp = NULL; @@ -2257,8 +2282,16 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, in return (SET_ERROR(ENOENT)); } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ - int err = dbuf_hold_impl(dn, level+1, - blkid >> epbs, fail_sparse, FALSE, NULL, parentp); + int err; + if (dh == NULL) { + err = dbuf_hold_impl(dn, level+1, + blkid >> epbs, fail_sparse, FALSE, NULL, parentp); + } else { + __dbuf_hold_impl_init(dh + 1, dn, dh->dh_level + 1, + blkid >> epbs, fail_sparse, FALSE, NULL, + parentp, dh->dh_depth + 1); + err = __dbuf_hold_impl(dh + 1); + } if (err) return (err); err = dbuf_read(*parentp, NULL, @@ -2626,108 +2659,189 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blk zio_nowait(pio); } +#define DBUF_HOLD_IMPL_MAX_DEPTH 20 + /* + * Helper function for __dbuf_hold_impl() to copy a buffer. Handles + * the case of encrypted, compressed and uncompressed buffers by + * allocating the new buffer, respectively, with arc_alloc_raw_buf(), + * arc_alloc_compressed_buf() or arc_alloc_buf().* + * + * NOTE: Declared noinline to avoid stack bloat in __dbuf_hold_impl(). + */ +noinline static void +dbuf_hold_copy(struct dbuf_hold_impl_data *dh) +{ + dnode_t *dn = dh->dh_dn; + dmu_buf_impl_t *db = dh->dh_db; + dbuf_dirty_record_t *dr = dh->dh_dr; + arc_buf_t *data = dr->dt.dl.dr_data; + + enum zio_compress compress_type = arc_get_compression(data); + + if (compress_type != ZIO_COMPRESS_OFF) { + dbuf_set_data(db, arc_alloc_compressed_buf( + dn->dn_objset->os_spa, db, arc_buf_size(data), + arc_buf_lsize(data), compress_type)); + } else { + dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db, + DBUF_GET_BUFC_TYPE(db), db->db.db_size)); + } + + bcopy(data->b_data, db->db.db_data, arc_buf_size(data)); +} + +/* * Returns with db_holds incremented, and db_mtx not held. * Note: dn_struct_rwlock must be held. */ -int -dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, - boolean_t fail_sparse, boolean_t fail_uncached, - void *tag, dmu_buf_impl_t **dbp) +static int +__dbuf_hold_impl(struct dbuf_hold_impl_data *dh) { - dmu_buf_impl_t *db, *parent = NULL; + ASSERT3S(dh->dh_depth, <, DBUF_HOLD_IMPL_MAX_DEPTH); + dh->dh_parent = NULL; - ASSERT(blkid != DMU_BONUS_BLKID); - ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); - ASSERT3U(dn->dn_nlevels, >, level); + ASSERT(dh->dh_blkid != DMU_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dh->dh_dn->dn_struct_rwlock)); + ASSERT3U(dh->dh_dn->dn_nlevels, >, dh->dh_level); - *dbp = NULL; -top: + *(dh->dh_dbp) = NULL; + /* dbuf_find() returns with db_mtx held */ - db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); + dh->dh_db = dbuf_find(dh->dh_dn->dn_objset, dh->dh_dn->dn_object, + dh->dh_level, dh->dh_blkid); - if (db == NULL) { - blkptr_t *bp = NULL; - int err; + if (dh->dh_db == NULL) { + dh->dh_bp = NULL; - if (fail_uncached) + if (dh->dh_fail_uncached) return (SET_ERROR(ENOENT)); - ASSERT3P(parent, ==, NULL); - err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); - if (fail_sparse) { - if (err == 0 && bp && BP_IS_HOLE(bp)) - err = SET_ERROR(ENOENT); - if (err) { - if (parent) - dbuf_rele(parent, NULL); - return (err); + ASSERT3P(dh->dh_parent, ==, NULL); + dh->dh_err = dbuf_findbp(dh->dh_dn, dh->dh_level, dh->dh_blkid, + dh->dh_fail_sparse, &dh->dh_parent, &dh->dh_bp, dh); + if (dh->dh_fail_sparse) { + if (dh->dh_err == 0 && + dh->dh_bp && BP_IS_HOLE(dh->dh_bp)) + dh->dh_err = SET_ERROR(ENOENT); + if (dh->dh_err) { + if (dh->dh_parent) + dbuf_rele(dh->dh_parent, NULL); + return (dh->dh_err); } } - if (err && err != ENOENT) - return (err); - db = dbuf_create(dn, level, blkid, parent, bp); + if (dh->dh_err && dh->dh_err != ENOENT) + return (dh->dh_err); + dh->dh_db = dbuf_create(dh->dh_dn, dh->dh_level, dh->dh_blkid, + dh->dh_parent, dh->dh_bp); } - if (fail_uncached && db->db_state != DB_CACHED) { - mutex_exit(&db->db_mtx); + if (dh->dh_fail_uncached && dh->dh_db->db_state != DB_CACHED) { + mutex_exit(&dh->dh_db->db_mtx); return (SET_ERROR(ENOENT)); } - if (db->db_buf != NULL) { - arc_buf_access(db->db_buf); - ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); + if (dh->dh_db->db_buf != NULL) { + arc_buf_access(dh->dh_db->db_buf); + ASSERT3P(dh->dh_db->db.db_data, ==, dh->dh_db->db_buf->b_data); } - ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); + ASSERT(dh->dh_db->db_buf == NULL || arc_referenced(dh->dh_db->db_buf)); /* * If this buffer is currently syncing out, and we are are * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. */ - if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && - dn->dn_object != DMU_META_DNODE_OBJECT && - db->db_state == DB_CACHED && db->db_data_pending) { - dbuf_dirty_record_t *dr = db->db_data_pending; - - if (dr->dt.dl.dr_data == db->db_buf) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - - dbuf_set_data(db, - arc_alloc_buf(dn->dn_objset->os_spa, db, type, - db->db.db_size)); - bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, - db->db.db_size); - } + if (dh->dh_db->db_level == 0 && + dh->dh_db->db_blkid != DMU_BONUS_BLKID && + dh->dh_dn->dn_object != DMU_META_DNODE_OBJECT && + dh->dh_db->db_state == DB_CACHED && dh->dh_db->db_data_pending) { + dh->dh_dr = dh->dh_db->db_data_pending; + if (dh->dh_dr->dt.dl.dr_data == dh->dh_db->db_buf) + dbuf_hold_copy(dh); } - if (multilist_link_active(&db->db_cache_link)) { - ASSERT(refcount_is_zero(&db->db_holds)); - ASSERT(db->db_caching_status == DB_DBUF_CACHE || - db->db_caching_status == DB_DBUF_METADATA_CACHE); + if (multilist_link_active(&dh->dh_db->db_cache_link)) { + ASSERT(refcount_is_zero(&dh->dh_db->db_holds)); + ASSERT(dh->dh_db->db_caching_status == DB_DBUF_CACHE || + dh->dh_db->db_caching_status == DB_DBUF_METADATA_CACHE); - multilist_remove(dbuf_caches[db->db_caching_status].cache, db); + multilist_remove( + dbuf_caches[dh->dh_db->db_caching_status].cache, + dh->dh_db); (void) refcount_remove_many( - &dbuf_caches[db->db_caching_status].size, - db->db.db_size, db); + &dbuf_caches[dh->dh_db->db_caching_status].size, + dh->dh_db->db.db_size, dh->dh_db); - db->db_caching_status = DB_NO_CACHE; + dh->dh_db->db_caching_status = DB_NO_CACHE; } - (void) refcount_add(&db->db_holds, tag); - DBUF_VERIFY(db); - mutex_exit(&db->db_mtx); + (void) refcount_add(&dh->dh_db->db_holds, dh->dh_tag); + DBUF_VERIFY(dh->dh_db); + mutex_exit(&dh->dh_db->db_mtx); /* NOTE: we can't rele the parent until after we drop the db_mtx */ - if (parent) - dbuf_rele(parent, NULL); + if (dh->dh_parent) + dbuf_rele(dh->dh_parent, NULL); - ASSERT3P(DB_DNODE(db), ==, dn); - ASSERT3U(db->db_blkid, ==, blkid); - ASSERT3U(db->db_level, ==, level); - *dbp = db; + ASSERT3P(DB_DNODE(dh->dh_db), ==, dh->dh_dn); + ASSERT3U(dh->dh_db->db_blkid, ==, dh->dh_blkid); + ASSERT3U(dh->dh_db->db_level, ==, dh->dh_level); + *(dh->dh_dbp) = dh->dh_db; return (0); +} + +/* + * The following code preserves the recursive function dbuf_hold_impl() + * but moves the local variables AND function arguments to the heap to + * minimize the stack frame size. Enough space is initially allocated + * on the stack for 20 levels of recursion. + */ +int +dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, + boolean_t fail_sparse, boolean_t fail_uncached, + void *tag, dmu_buf_impl_t **dbp) +{ + struct dbuf_hold_impl_data *dh; + int error; + + dh = kmem_alloc(sizeof (struct dbuf_hold_impl_data) * + DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP); + __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, + fail_uncached, tag, dbp, 0); + + error = __dbuf_hold_impl(dh); + + kmem_free(dh, sizeof (struct dbuf_hold_impl_data) * + DBUF_HOLD_IMPL_MAX_DEPTH); + + return (error); +} + +static void +__dbuf_hold_impl_init(struct dbuf_hold_impl_data *dh, + dnode_t *dn, uint8_t level, uint64_t blkid, + boolean_t fail_sparse, boolean_t fail_uncached, + void *tag, dmu_buf_impl_t **dbp, int depth) +{ + dh->dh_dn = dn; + dh->dh_level = level; + dh->dh_blkid = blkid; + + dh->dh_fail_sparse = fail_sparse; + dh->dh_fail_uncached = fail_uncached; + + dh->dh_tag = tag; + dh->dh_dbp = dbp; + + dh->dh_db = NULL; + dh->dh_parent = NULL; + dh->dh_bp = NULL; + dh->dh_err = 0; + dh->dh_dr = NULL; + + dh->dh_depth = depth; } dmu_buf_impl_t *
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201808120224.w7C2OJRA028363>