Date: Tue, 30 Jul 2013 21:35:02 +0000 (UTC) From: Xin LI <delphij@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r253821 - in head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys Message-ID: <201307302135.r6ULZ2tu079748@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: delphij Date: Tue Jul 30 21:35:02 2013 New Revision: 253821 URL: http://svnweb.freebsd.org/changeset/base/253821 Log: MFV r253783: Skip eviction step of processing free records when doing ZFS receive to avoid the expensive search operation of non-existent dbufs in dn_dbufs. Illumos ZFS issues: 3834 incremental replication of 'holey' file systems is slow MFC after: 2 weeks Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h Directory Properties: head/sys/cddl/contrib/opensolaris/ (props changed) Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c Tue Jul 30 21:20:12 2013 (r253820) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c Tue Jul 30 21:35:02 2013 (r253821) @@ -27,6 +27,7 @@ #include <sys/zfs_context.h> #include <sys/dmu.h> +#include <sys/dmu_send.h> #include <sys/dmu_impl.h> #include <sys/dbuf.h> #include <sys/dmu_objset.h> @@ -796,9 +797,12 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) /* * Evict (if its unreferenced) or clear (if its referenced) any level-0 * data blocks in the free range, so that any future readers will find - * empty blocks. Also, if we happen accross any level-1 dbufs in the + * empty blocks. Also, if we happen across any level-1 dbufs in the * range that have not already been marked dirty, mark them dirty so * they stay in memory. + * + * This is a no-op if the dataset is in the middle of an incremental + * receive; see comment below for details. */ void dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) @@ -814,6 +818,20 @@ dbuf_free_range(dnode_t *dn, uint64_t st last_l1 = end >> epbs; } dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); + + if (dmu_objset_is_receiving(dn->dn_objset)) { + /* + * When processing a free record from a zfs receive, + * there should have been no previous modifications to the + * data in this range. Therefore there should be no dbufs + * in the range. Searching dn_dbufs for these non-existent + * dbufs can be very expensive, so simply ignore this. + */ + VERIFY3P(dbuf_find(dn, 0, start), ==, NULL); + VERIFY3P(dbuf_find(dn, 0, end), ==, NULL); + return; + } + mutex_enter(&dn->dn_dbufs_mtx); for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c Tue Jul 30 21:20:12 2013 (r253820) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c Tue Jul 30 21:35:02 2013 (r253821) @@ -96,6 +96,32 @@ dump_free(dmu_sendarg_t *dsp, uint64_t o { struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); + /* + * When we receive a free record, dbuf_free_range() assumes + * that the receiving system doesn't have any dbufs in the range + * being freed. This is always true because there is a one-record + * constraint: we only send one WRITE record for any given + * object+offset. We know that the one-record constraint is + * true because we always send data in increasing order by + * object,offset. + * + * If the increasing-order constraint ever changes, we should find + * another way to assert that the one-record constraint is still + * satisfied. + */ + ASSERT(object > dsp->dsa_last_data_object || + (object == dsp->dsa_last_data_object && + offset > dsp->dsa_last_data_offset)); + + /* + * If we are doing a non-incremental send, then there can't + * be any data in the dataset we're receiving into. Therefore + * a free record would simply be a no-op. Save space by not + * sending it to begin with. + */ + if (!dsp->dsa_incremental) + return (0); + if (length != -1ULL && offset + length < offset) length = -1ULL; @@ -162,6 +188,15 @@ dump_data(dmu_sendarg_t *dsp, dmu_object { struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); + /* + * We send data in increasing object, offset order. + * See comment in dump_free() for details. + */ + ASSERT(object > dsp->dsa_last_data_object || + (object == dsp->dsa_last_data_object && + offset > dsp->dsa_last_data_offset)); + dsp->dsa_last_data_object = object; + dsp->dsa_last_data_offset = offset + blksz - 1; /* * If there is any kind of pending aggregation (currently either @@ -229,6 +264,10 @@ dump_freeobjects(dmu_sendarg_t *dsp, uin { struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); + /* See comment in dump_free(). */ + if (!dsp->dsa_incremental) + return (0); + /* * If there is a pending op, but it's not PENDING_FREEOBJECTS, * push it out, since free block aggregation can only be done for @@ -305,9 +344,9 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) return (SET_ERROR(EINTR)); - /* free anything past the end of the file */ + /* Free anything past the end of the file. */ if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL)) + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) return (SET_ERROR(EINTR)); if (dsp->dsa_err != 0) return (SET_ERROR(EINTR)); @@ -495,6 +534,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsp->dsa_toguid = ds->ds_phys->ds_guid; ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); dsp->dsa_pending_op = PENDING_NONE; + dsp->dsa_incremental = (fromtxg != 0); mutex_enter(&ds->ds_sendstream_lock); list_insert_head(&ds->ds_sendstreams, dsp); @@ -1819,3 +1859,13 @@ dmu_recv_end(dmu_recv_cookie_t *drc, voi else return (dmu_recv_existing_end(drc)); } + +/* + * Return TRUE if this objset is currently being received into. + */ +boolean_t +dmu_objset_is_receiving(objset_t *os) +{ + return (os->os_dsl_dataset != NULL && + os->os_dsl_dataset->ds_owner == dmu_recv_tag); +} Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c Tue Jul 30 21:20:12 2013 (r253820) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c Tue Jul 30 21:35:02 2013 (r253821) @@ -587,8 +587,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t { dmu_tx_hold_t *txh; dnode_t *dn; - uint64_t start, end, i; - int err, shift; + int err; zio_t *zio; ASSERT(tx->tx_txg == 0); @@ -599,34 +598,48 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t return; dn = txh->txh_dnode; - /* first block */ - if (off != 0) - dmu_tx_count_write(txh, off, 1); - /* last block */ - if (len != DMU_OBJECT_END) - dmu_tx_count_write(txh, off+len, 1); - - dmu_tx_count_dnode(txh); - if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; + dmu_tx_count_dnode(txh); + /* - * For i/o error checking, read the first and last level-0 - * blocks, and all the level-1 blocks. The above count_write's - * have already taken care of the level-0 blocks. + * For i/o error checking, we read the first and last level-0 + * blocks if they are not aligned, and all the level-1 blocks. + * + * Note: dbuf_free_range() assumes that we have not instantiated + * any level-0 dbufs that will be completely freed. Therefore we must + * exercise care to not read or count the first and last blocks + * if they are blocksize-aligned. + */ + if (dn->dn_datablkshift == 0) { + dmu_tx_count_write(txh, off, len); + } else { + /* first block will be modified if it is not aligned */ + if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) + dmu_tx_count_write(txh, off, 1); + /* last block will be modified if it is not aligned */ + if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) + dmu_tx_count_write(txh, off+len, 1); + } + + /* + * Check level-1 blocks. */ if (dn->dn_nlevels > 1) { - shift = dn->dn_datablkshift + dn->dn_indblkshift - + int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; - start = off >> shift; - end = dn->dn_datablkshift ? ((off+len) >> shift) : 0; + uint64_t start = off >> shift; + uint64_t end = (off + len) >> shift; + + ASSERT(dn->dn_datablkshift != 0); + ASSERT(dn->dn_indblkshift != 0); zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); - for (i = start; i <= end; i++) { + for (uint64_t i = start; i <= end; i++) { uint64_t ibyte = i << shift; err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); i = ibyte >> shift; Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h Tue Jul 30 21:20:12 2013 (r253820) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h Tue Jul 30 21:35:02 2013 (r253821) @@ -21,8 +21,11 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + */ +/* * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Martin Matuska <mm@FreeBSD.org>. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_IMPL_H @@ -293,6 +296,9 @@ typedef struct dmu_sendarg { uint64_t dsa_toguid; int dsa_err; dmu_pendop_t dsa_pending_op; + boolean_t dsa_incremental; + uint64_t dsa_last_data_object; + uint64_t dsa_last_data_offset; } dmu_sendarg_t; Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h Tue Jul 30 21:20:12 2013 (r253820) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h Tue Jul 30 21:35:02 2013 (r253821) @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ @@ -74,5 +74,6 @@ int dmu_recv_stream(dmu_recv_cookie_t *d #endif int cleanup_fd, uint64_t *action_handlep); int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner); +boolean_t dmu_objset_is_receiving(objset_t *os); #endif /* _DMU_SEND_H */
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201307302135.r6ULZ2tu079748>