Skip site navigation (1)Skip section navigation (2)
Date:      Sun, 25 Nov 2012 16:32:08 +0000 (UTC)
From:      Martin Matuska <mm@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r243524 - in head: cddl/contrib/opensolaris/cmd/ztest sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys
Message-ID:  <201211251632.qAPGW8wd091581@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: mm
Date: Sun Nov 25 16:32:07 2012
New Revision: 243524
URL: http://svnweb.freebsd.org/changeset/base/243524

Log:
  MFV r243013 and r243267:
  
  Import the zio nop-write improvement from Illumos. To reduce I/O,
  nop-write omits overwriting data if the checksum (cryptographically
  secure) of new data matches the checksum of existing data.
  It also saves space if snapshots are in use.
  
  It currently works only on datasets with enabled compression, disabled
  deduplication and sha256 checksums.
  
  IllumOS 13887:196932ec9e6a and 13888:7204b3392a58
  3236 zio nop-write
  
  References:
  https://www.illumos.org/issues/3236
  
  MFC after:	2 weeks

Modified:
  head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
Directory Properties:
  head/cddl/contrib/opensolaris/   (props changed)
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Sun Nov 25 16:19:12 2012	(r243523)
+++ head/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Sun Nov 25 16:32:07 2012	(r243524)
@@ -204,6 +204,7 @@ enum ztest_io_type {
 	ZTEST_IO_WRITE_ZEROES,
 	ZTEST_IO_TRUNCATE,
 	ZTEST_IO_SETATTR,
+	ZTEST_IO_REWRITE,
 	ZTEST_IO_TYPES
 };
 
@@ -1867,6 +1868,12 @@ ztest_get_data(void *arg, lr_write_t *lr
 		    DMU_READ_NO_PREFETCH);
 
 		if (error == 0) {
+			blkptr_t *obp = dmu_buf_get_blkptr(db);
+			if (obp) {
+				ASSERT(BP_IS_HOLE(bp));
+				*bp = *obp;
+			}
+
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 
@@ -2012,6 +2019,9 @@ ztest_remove(ztest_ds_t *zd, ztest_od_t 
 			continue;
 		}
 
+		/*
+		 * No object was found.
+		 */
 		if (od->od_object == 0)
 			continue;
 
@@ -2127,6 +2137,7 @@ ztest_prealloc(ztest_ds_t *zd, uint64_t 
 static void
 ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset)
 {
+	int err;
 	ztest_block_tag_t wbt;
 	dmu_object_info_t doi;
 	enum ztest_io_type io_type;
@@ -2179,6 +2190,25 @@ ztest_io(ztest_ds_t *zd, uint64_t object
 	case ZTEST_IO_SETATTR:
 		(void) ztest_setattr(zd, object);
 		break;
+
+	case ZTEST_IO_REWRITE:
+		(void) rw_rdlock(&ztest_name_lock);
+		err = ztest_dsl_prop_set_uint64(zd->zd_name,
+		    ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa),
+		    B_FALSE);
+		VERIFY(err == 0 || err == ENOSPC);
+		err = ztest_dsl_prop_set_uint64(zd->zd_name,
+		    ZFS_PROP_COMPRESSION,
+		    ztest_random_dsl_prop(ZFS_PROP_COMPRESSION),
+		    B_FALSE);
+		VERIFY(err == 0 || err == ENOSPC);
+		(void) rw_unlock(&ztest_name_lock);
+
+		VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data,
+		    DMU_READ_NO_PREFETCH));
+
+		(void) ztest_write(zd, object, offset, blocksize, data);
+		break;
 	}
 
 	(void) rw_unlock(&zd->zd_zilog_lock);
@@ -2266,7 +2296,12 @@ ztest_zil_remount(ztest_ds_t *zd, uint64
 {
 	objset_t *os = zd->zd_os;
 
-	VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0);
+	/*
+	 * We grab the zd_dirobj_lock to ensure that no other thread is
+	 * updating the zil (i.e. adding in-memory log records) and the
+	 * zd_zilog_lock to block any I/O.
+	 */
+	VERIFY0(mutex_lock(&zd->zd_dirobj_lock));
 	(void) rw_wrlock(&zd->zd_zilog_lock);
 
 	/* zfsvfs_teardown() */
@@ -4925,8 +4960,8 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_
 	 */
 	for (int i = 0; i < copies; i++) {
 		uint64_t offset = i * blocksize;
-		VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db,
-		    DMU_READ_NO_PREFETCH) == 0);
+		VERIFY0(dmu_buf_hold(os, object, offset, FTAG, &db,
+		    DMU_READ_NO_PREFETCH));
 		ASSERT(db->db_offset == offset);
 		ASSERT(db->db_size == blocksize);
 		ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) ||
@@ -4942,8 +4977,8 @@ ztest_ddt_repair(ztest_ds_t *zd, uint64_
 	/*
 	 * Find out what block we got.
 	 */
-	VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db,
-	    DMU_READ_NO_PREFETCH) == 0);
+	VERIFY0(dmu_buf_hold(os, object, 0, FTAG, &db,
+	    DMU_READ_NO_PREFETCH));
 	blk = *((dmu_buf_impl_t *)db)->db_blkptr;
 	dmu_buf_rele(db, FTAG);
 
@@ -5621,6 +5656,8 @@ ztest_freeze(void)
 	kernel_init(FREAD | FWRITE);
 	VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG));
 	VERIFY3U(0, ==, ztest_dataset_open(0));
+	spa->spa_debug = B_TRUE;
+	ztest_spa = spa;
 
 	/*
 	 * Force the first log block to be transactionally allocated.

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Sun Nov 25 16:19:12 2012	(r243523)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Sun Nov 25 16:32:07 2012	(r243524)
@@ -3688,6 +3688,12 @@ arc_write_done(zio_t *zio)
 				arc_hdr_destroy(exists);
 				exists = buf_hash_insert(hdr, &hash_lock);
 				ASSERT3P(exists, ==, NULL);
+			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
+				/* nopwrite */
+				ASSERT(zio->io_prop.zp_nopwrite);
+				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
+					panic("bad nopwrite, hdr=%p exists=%p",
+					    (void *)hdr, (void *)exists);
 			} else {
 				/* Dedup */
 				ASSERT(hdr->b_datacnt == 1);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Sun Nov 25 16:19:12 2012	(r243523)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Sun Nov 25 16:32:07 2012	(r243524)
@@ -768,13 +768,15 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
 	ASSERT(db->db_data_pending != dr);
 
 	/* free this block */
-	if (!BP_IS_HOLE(bp)) {
+	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
 		spa_t *spa;
 
 		DB_GET_SPA(&spa, db);
 		zio_free(spa, txg, bp);
 	}
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
+	dr->dt.dl.dr_nopwrite = B_FALSE;
+
 	/*
 	 * Release the already-written buffer, so we leave it in
 	 * a consistent dirty state.  Note that all callers are
@@ -2189,6 +2191,13 @@ dmu_buf_freeable(dmu_buf_t *dbuf)
 	return (res);
 }
 
+blkptr_t *
+dmu_buf_get_blkptr(dmu_buf_t *db)
+{
+	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+	return (dbi->db_blkptr);
+}
+
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {
@@ -2531,7 +2540,11 @@ dbuf_write_done(zio_t *zio, arc_buf_t *b
 	ASSERT0(zio->io_error);
 	ASSERT(db->db_blkptr == bp);
 
-	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+	/*
+	 * For nopwrites and rewrites we ensure that the bp matches our
+	 * original and bypass all the accounting.
+	 */
+	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
 		ASSERT(BP_EQUAL(bp, bp_orig));
 	} else {
 		objset_t *os;
@@ -2722,7 +2735,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
 		mutex_enter(&db->db_mtx);
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
-		    dr->dt.dl.dr_copies);
+		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
 		mutex_exit(&db->db_mtx);
 	} else if (db->db_state == DB_NOFILL) {
 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Sun Nov 25 16:19:12 2012	(r243523)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Sun Nov 25 16:32:07 2012	(r243524)
@@ -40,11 +40,17 @@
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
 #include <sys/sa.h>
 #ifdef _KERNEL
 #include <sys/zfs_znode.h>
 #endif
 
+/*
+ * Enable/disable nopwrite feature.
+ */
+int zfs_nopwrite_enabled = 1;
+
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{	DMU_BSWAP_UINT8,	TRUE,	"unallocated"		},
 	{	DMU_BSWAP_ZAP,		TRUE,	"object directory"	},
@@ -1287,6 +1293,16 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf
 	mutex_enter(&db->db_mtx);
 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
 	if (zio->io_error == 0) {
+		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
+		if (dr->dt.dl.dr_nopwrite) {
+			blkptr_t *bp = zio->io_bp;
+			blkptr_t *bp_orig = &zio->io_bp_orig;
+			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
+
+			ASSERT(BP_EQUAL(bp, bp_orig));
+			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
+			ASSERT(zio_checksum_table[chksum].ci_dedup);
+		}
 		dr->dt.dl.dr_overridden_by = *zio->io_bp;
 		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
 		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
@@ -1308,11 +1324,22 @@ dmu_sync_late_arrival_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	dmu_sync_arg_t *dsa = zio->io_private;
+	blkptr_t *bp_orig = &zio->io_bp_orig;
 
 	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
-		ASSERT(zio->io_bp->blk_birth == zio->io_txg);
-		ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
-		zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+		/*
+		 * If we didn't allocate a new block (i.e. ZIO_FLAG_NOPWRITE)
+		 * then there is nothing to do here. Otherwise, free the
+		 * newly allocated block in this txg.
+		 */
+		if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
+			ASSERT(BP_EQUAL(bp, bp_orig));
+		} else {
+			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
+			ASSERT(zio->io_bp->blk_birth == zio->io_txg);
+			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
+			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
+		}
 	}
 
 	dmu_tx_commit(dsa->dsa_tx);
@@ -1357,7 +1384,7 @@ dmu_sync_late_arrival(zio_t *pio, objset
  *
  * Return values:
  *
- *	EEXIST: this txg has already been synced, so there's nothing to to.
+ *	EEXIST: this txg has already been synced, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
@@ -1389,7 +1416,6 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s
 	dnode_t *dn;
 
 	ASSERT(pio != NULL);
-	ASSERT(BP_IS_HOLE(bp));
 	ASSERT(txg != 0);
 
 	SET_BOOKMARK(&zb, ds->ds_object,
@@ -1444,6 +1470,23 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s
 		return (ENOENT);
 	}
 
+	ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
+
+	/*
+	 * Assume the on-disk data is X, the current syncing data is Y,
+	 * and the current in-memory data is Z (currently in dmu_sync).
+	 * X and Z are identical but Y has been modified. Normally,
+	 * when X and Z are the same we will perform a nopwrite but if Y
+	 * is different we must disable nopwrite since the resulting write
+	 * of Y to disk can free the block containing X. If we allowed a
+	 * nopwrite to occur the block pointing to Z would reference a freed
+	 * block. Since this is a rare case we simplify this by disabling
+	 * nopwrite if the current dmu_sync-ing dbuf has been modified in
+	 * a previous transaction.
+	 */
+	if (dr->dr_next)
+		zp.zp_nopwrite = B_FALSE;
+
 	ASSERT(dr->dr_txg == txg);
 	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
@@ -1532,15 +1575,27 @@ dmu_write_policy(objset_t *os, dnode_t *
 	enum zio_checksum checksum = os->os_checksum;
 	enum zio_compress compress = os->os_compress;
 	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
-	boolean_t dedup;
+	boolean_t dedup = B_FALSE;
+	boolean_t nopwrite = B_FALSE;
 	boolean_t dedup_verify = os->os_dedup_verify;
 	int copies = os->os_copies;
 
 	/*
-	 * Determine checksum setting.
+	 * We maintain different write policies for each of the following
+	 * types of data:
+	 *	 1. metadata
+	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
+	 *	 3. all other level 0 blocks
 	 */
 	if (ismd) {
 		/*
+		 * XXX -- we should design a compression algorithm
+		 * that specializes in arrays of bps.
+		 */
+		compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
+		    ZIO_COMPRESS_LZJB;
+
+		/*
 		 * Metadata always gets checksummed.  If the data
 		 * checksum is multi-bit correctable, and it's not a
 		 * ZBT-style checksum, then it's suitable for metadata
@@ -1550,45 +1605,47 @@ dmu_write_policy(objset_t *os, dnode_t *
 		if (zio_checksum_table[checksum].ci_correctable < 1 ||
 		    zio_checksum_table[checksum].ci_eck)
 			checksum = ZIO_CHECKSUM_FLETCHER_4;
-	} else {
-		checksum = zio_checksum_select(dn->dn_checksum, checksum);
-	}
+	} else if (wp & WP_NOFILL) {
+		ASSERT(level == 0);
 
-	/*
-	 * Determine compression setting.
-	 */
-	if (ismd) {
 		/*
-		 * XXX -- we should design a compression algorithm
-		 * that specializes in arrays of bps.
+		 * If we're writing preallocated blocks, we aren't actually
+		 * writing them so don't set any policy properties.  These
+		 * blocks are currently only used by an external subsystem
+		 * outside of zfs (i.e. dump) and not written by the zio
+		 * pipeline.
 		 */
-		compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
-		    ZIO_COMPRESS_LZJB;
+		compress = ZIO_COMPRESS_OFF;
+		checksum = ZIO_CHECKSUM_OFF;
 	} else {
 		compress = zio_compress_select(dn->dn_compress, compress);
-	}
 
-	/*
-	 * Determine dedup setting.  If we are in dmu_sync(), we won't
-	 * actually dedup now because that's all done in syncing context;
-	 * but we do want to use the dedup checkum.  If the checksum is not
-	 * strong enough to ensure unique signatures, force dedup_verify.
-	 */
-	dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
-	if (dedup) {
-		checksum = dedup_checksum;
-		if (!zio_checksum_table[checksum].ci_dedup)
-			dedup_verify = 1;
-	}
+		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
+		    zio_checksum_select(dn->dn_checksum, checksum) :
+		    dedup_checksum;
 
-	if (wp & WP_DMU_SYNC)
-		dedup = 0;
+		/*
+		 * Determine dedup setting.  If we are in dmu_sync(),
+		 * we won't actually dedup now because that's all
+		 * done in syncing context; but we do want to use the
+		 * dedup checksum.  If the checksum is not strong
+		 * enough to ensure unique signatures, force
+		 * dedup_verify.
+		 */
+		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
+			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
+			if (!zio_checksum_table[checksum].ci_dedup)
+				dedup_verify = B_TRUE;
+		}
 
-	if (wp & WP_NOFILL) {
-		ASSERT(!ismd && level == 0);
-		checksum = ZIO_CHECKSUM_OFF;
-		compress = ZIO_COMPRESS_OFF;
-		dedup = B_FALSE;
+		/*
+		 * Enable nopwrite if we have a cryptographically secure
+		 * checksum that has no known collisions (i.e. SHA-256)
+		 * and compression is enabled.  We don't enable nopwrite if
+		 * dedup is enabled as the two features are mutually exclusive.
+		 */
+		nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup &&
+		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
 	}
 
 	zp->zp_checksum = checksum;
@@ -1598,6 +1655,7 @@ dmu_write_policy(objset_t *os, dnode_t *
 	zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
 	zp->zp_dedup = dedup;
 	zp->zp_dedup_verify = dedup && dedup_verify;
+	zp->zp_nopwrite = nopwrite;
 }
 
 int

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	Sun Nov 25 16:19:12 2012	(r243523)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c	Sun Nov 25 16:32:07 2012	(r243524)
@@ -440,7 +440,6 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t t
 	 * clean up our in-memory structures accumulated while syncing:
 	 *
 	 *  - move dead blocks from the pending deadlist to the on-disk deadlist
-	 *  - clean up zil records
 	 *  - release hold from dsl_dataset_dirty()
 	 */
 	while (ds = list_remove_head(&synced_datasets)) {

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h	Sun Nov 25 16:19:12 2012	(r243523)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h	Sun Nov 25 16:32:07 2012	(r243524)
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_DBUF_H
@@ -130,6 +131,7 @@ typedef struct dbuf_dirty_record {
 			blkptr_t dr_overridden_by;
 			override_states_t dr_override_state;
 			uint8_t dr_copies;
+			boolean_t dr_nopwrite;
 		} dl;
 	} dt;
 } dbuf_dirty_record_t;

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	Sun Nov 25 16:19:12 2012	(r243523)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	Sun Nov 25 16:32:07 2012	(r243524)
@@ -505,6 +505,11 @@ void dmu_evict_user(objset_t *os, dmu_bu
 void *dmu_buf_get_user(dmu_buf_t *db);
 
 /*
+ * Returns the blkptr associated with this dbuf, or NULL if not set.
+ */
+struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
+
+/*
  * Indicate that you are going to modify the buffer's data (db_data).
  *
  * The transaction (tx) must be assigned to a txg (ie. you've called

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	Sun Nov 25 16:19:12 2012	(r243523)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	Sun Nov 25 16:32:07 2012	(r243524)
@@ -188,7 +188,9 @@ enum zio_flag {
 	ZIO_FLAG_RAW		= 1 << 21,
 	ZIO_FLAG_GANG_CHILD	= 1 << 22,
 	ZIO_FLAG_DDT_CHILD	= 1 << 23,
-	ZIO_FLAG_GODFATHER	= 1 << 24
+	ZIO_FLAG_GODFATHER	= 1 << 24,
+	ZIO_FLAG_NOPWRITE	= 1 << 25,
+	ZIO_FLAG_REEXECUTED	= 1 << 26,
 };
 
 #define	ZIO_FLAG_MUSTSUCCEED		0
@@ -287,8 +289,9 @@ typedef struct zio_prop {
 	dmu_object_type_t	zp_type;
 	uint8_t			zp_level;
 	uint8_t			zp_copies;
-	uint8_t			zp_dedup;
-	uint8_t			zp_dedup_verify;
+	boolean_t		zp_dedup;
+	boolean_t		zp_dedup_verify;
+	boolean_t		zp_nopwrite;
 } zio_prop_t;
 
 typedef struct zio_cksum_report zio_cksum_report_t;
@@ -491,7 +494,8 @@ extern zio_t *zio_rewrite(zio_t *pio, sp
     void *data, uint64_t size, zio_done_func_t *done, void *priv,
     int priority, enum zio_flag flags, zbookmark_t *zb);
 
-extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies);
+extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
+    boolean_t nopwrite);
 
 extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h	Sun Nov 25 16:19:12 2012	(r243523)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h	Sun Nov 25 16:32:07 2012	(r243524)
@@ -38,6 +38,70 @@ extern "C" {
 #endif
 
 /*
+ * XXX -- Describe ZFS I/O pipeline here. Fill in as needed.
+ *
+ * The ZFS I/O pipeline is comprised of various stages which are defined
+ * in the zio_stage enum below. The individual stages are used to construct
+ * these basic I/O operations: Read, Write, Free, Claim, and Ioctl.
+ *
+ * I/O operations: (XXX - provide detail for each of the operations)
+ *
+ * Read:
+ * Write:
+ * Free:
+ * Claim:
+ * Ioctl:
+ *
+ * Although the most common pipelines are used by the basic I/O operations
+ * above, there are some helper pipelines (one could consider them
+ * sub-pipelines) which are used internally by the ZIO module and are
+ * explained below:
+ *
+ * Interlock Pipeline:
+ * The interlock pipeline is the most basic pipeline and is used by all
+ * of the I/O operations. The interlock pipeline does not perform any I/O
+ * and is used to coordinate the dependencies between I/Os that are being
+ * issued (i.e. the parent/child relationship).
+ *
+ * Vdev child Pipeline:
+ * The vdev child pipeline is responsible for performing the physical I/O.
+ * It is in this pipeline where the I/Os are queued and possibly cached.
+ *
+ * In addition to performing I/O, the pipeline is also responsible for
+ * data transformations. The transformations performed are based on the
+ * specific properties that the user may have selected and modify the
+ * behavior of the pipeline. Examples of supported transformations are
+ * compression, dedup, and nop writes. Transformations will either modify
+ * the data or the pipeline. The list below further describes each of
+ * the supported transformations:
+ *
+ * Compression:
+ * ZFS supports three different flavors of compression -- gzip, lzjb, and
+ * zle. Compression occurs as part of the write pipeline and is performed
+ * in the ZIO_STAGE_WRITE_BP_INIT stage.
+ *
+ * Dedup:
+ * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and
+ * ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing
+ * read pipeline if the dedup bit is set on the block pointer.
+ * Writing a dedup block is performed by the ZIO_STAGE_DDT_WRITE stage
+ * and added to a write pipeline if a user has enabled dedup on that
+ * particular dataset.
+ *
+ * NOP Write:
+ * The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage
+ * and is added to an existing write pipeline if a cryptographically
+ * secure checksum (i.e. SHA256) is enabled and compression is turned on.
+ * The NOP write stage will compare the checksums of the current data
+ * on-disk (level-0 blocks only) and the data that is currently being written.
+ * If the checksum values are identical then the pipeline is converted to
+ * an interlock pipeline skipping block allocation and bypassing the
+ * physical I/O.  The nop write feature can handle writes in either
+ * syncing or open context (i.e. zil writes) and as a result is mutually
+ * exclusive with dedup.
+ */
+
+/*
  * zio pipeline stage definitions
  */
 enum zio_stage {
@@ -50,27 +114,29 @@ enum zio_stage {
 
 	ZIO_STAGE_CHECKSUM_GENERATE	= 1 << 5,	/* -W--- */
 
-	ZIO_STAGE_DDT_READ_START	= 1 << 6,	/* R---- */
-	ZIO_STAGE_DDT_READ_DONE		= 1 << 7,	/* R---- */
-	ZIO_STAGE_DDT_WRITE		= 1 << 8,	/* -W--- */
-	ZIO_STAGE_DDT_FREE		= 1 << 9,	/* --F-- */
+	ZIO_STAGE_NOP_WRITE		= 1 << 6,	/* -W--- */
+
+	ZIO_STAGE_DDT_READ_START	= 1 << 7,	/* R---- */
+	ZIO_STAGE_DDT_READ_DONE		= 1 << 8,	/* R---- */
+	ZIO_STAGE_DDT_WRITE		= 1 << 9,	/* -W--- */
+	ZIO_STAGE_DDT_FREE		= 1 << 10,	/* --F-- */
 
-	ZIO_STAGE_GANG_ASSEMBLE		= 1 << 10,	/* RWFC- */
-	ZIO_STAGE_GANG_ISSUE		= 1 << 11,	/* RWFC- */
+	ZIO_STAGE_GANG_ASSEMBLE		= 1 << 11,	/* RWFC- */
+	ZIO_STAGE_GANG_ISSUE		= 1 << 12,	/* RWFC- */
 
-	ZIO_STAGE_DVA_ALLOCATE		= 1 << 12,	/* -W--- */
-	ZIO_STAGE_DVA_FREE		= 1 << 13,	/* --F-- */
-	ZIO_STAGE_DVA_CLAIM		= 1 << 14,	/* ---C- */
+	ZIO_STAGE_DVA_ALLOCATE		= 1 << 13,	/* -W--- */
+	ZIO_STAGE_DVA_FREE		= 1 << 14,	/* --F-- */
+	ZIO_STAGE_DVA_CLAIM		= 1 << 15,	/* ---C- */
 
-	ZIO_STAGE_READY			= 1 << 15,	/* RWFCI */
+	ZIO_STAGE_READY			= 1 << 16,	/* RWFCI */
 
-	ZIO_STAGE_VDEV_IO_START		= 1 << 16,	/* RWF-I */
-	ZIO_STAGE_VDEV_IO_DONE		= 1 << 17,	/* RWF-- */
-	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 18,	/* RWF-I */
+	ZIO_STAGE_VDEV_IO_START		= 1 << 17,	/* RWF-I */
+	ZIO_STAGE_VDEV_IO_DONE		= 1 << 18,	/* RWF-- */
+	ZIO_STAGE_VDEV_IO_ASSESS	= 1 << 19,	/* RWF-I */
 
-	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 19,	/* R---- */
+	ZIO_STAGE_CHECKSUM_VERIFY	= 1 << 20,	/* R---- */
 
-	ZIO_STAGE_DONE			= 1 << 20	/* RWFCI */
+	ZIO_STAGE_DONE			= 1 << 21	/* RWFCI */
 };
 
 #define	ZIO_INTERLOCK_STAGES			\

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	Sun Nov 25 16:19:12 2012	(r243523)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c	Sun Nov 25 16:32:07 2012	(r243524)
@@ -1203,6 +1203,12 @@ zfs_get_data(void *arg, lr_write_t *lr, 
 			    DMU_READ_NO_PREFETCH);
 
 		if (error == 0) {
+			blkptr_t *obp = dmu_buf_get_blkptr(db);
+			if (obp) {
+				ASSERT(BP_IS_HOLE(bp));
+				*bp = *obp;
+			}
+
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Sun Nov 25 16:19:12 2012	(r243523)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Sun Nov 25 16:32:07 2012	(r243524)
@@ -709,9 +709,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64
 	    DMU_OT_IS_VALID(zp->zp_type) &&
 	    zp->zp_level < 32 &&
 	    zp->zp_copies > 0 &&
-	    zp->zp_copies <= spa_max_replication(spa) &&
-	    zp->zp_dedup <= 1 &&
-	    zp->zp_dedup_verify <= 1);
+	    zp->zp_copies <= spa_max_replication(spa));
 
 	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
@@ -739,13 +737,20 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint
 }
 
 void
-zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
 {
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 
+	/*
+	 * We must reset the io_prop to match the values that existed
+	 * when the bp was first written by dmu_sync() keeping in mind
+	 * that nopwrite and dedup are mutually exclusive.
+	 */
+	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
+	zio->io_prop.zp_nopwrite = nopwrite;
 	zio->io_prop.zp_copies = copies;
 	zio->io_bp_override = bp;
 }
@@ -1045,6 +1050,19 @@ zio_write_bp_init(zio_t *zio)
 		*bp = *zio->io_bp_override;
 		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+		/*
+		 * If we've been overridden and nopwrite is set then
+		 * set the flag accordingly to indicate that a nopwrite
+		 * has already occurred.
+		 */
+		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
+			ASSERT(!zp->zp_dedup);
+			zio->io_flags |= ZIO_FLAG_NOPWRITE;
+			return (ZIO_PIPELINE_CONTINUE);
+		}
+
+		ASSERT(!zp->zp_nopwrite);
+
 		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
 			return (ZIO_PIPELINE_CONTINUE);
 
@@ -1132,6 +1150,11 @@ zio_write_bp_init(zio_t *zio)
 			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
 			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
 		}
+		if (zp->zp_nopwrite) {
+			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
+		}
 	}
 
 	return (ZIO_PIPELINE_CONTINUE);
@@ -1353,6 +1376,7 @@ zio_reexecute(zio_t *pio)
 	pio->io_stage = pio->io_orig_stage;
 	pio->io_pipeline = pio->io_orig_pipeline;
 	pio->io_reexecute = 0;
+	pio->io_flags |= ZIO_FLAG_REEXECUTED;
 	pio->io_error = 0;
 	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 		pio->io_state[w] = 0;
@@ -1829,8 +1853,9 @@ zio_write_gang_block(zio_t *pio)
 		zp.zp_type = DMU_OT_NONE;
 		zp.zp_level = 0;
 		zp.zp_copies = gio->io_prop.zp_copies;
-		zp.zp_dedup = 0;
-		zp.zp_dedup_verify = 0;
+		zp.zp_dedup = B_FALSE;
+		zp.zp_dedup_verify = B_FALSE;
+		zp.zp_nopwrite = B_FALSE;
 
 		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
 		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
@@ -1850,6 +1875,62 @@ zio_write_gang_block(zio_t *pio)
 }
 
 /*
+ * The zio_nop_write stage in the pipeline determines if allocating
+ * a new bp is necessary.  By leveraging a cryptographically secure checksum,
+ * such as SHA256, we can compare the checksums of the new data and the old
+ * to determine if allocating a new block is required.  The nopwrite
+ * feature can handle writes in either syncing or open context (i.e. zil
+ * writes) and as a result is mutually exclusive with dedup.
+ */
+static int
+zio_nop_write(zio_t *zio)
+{
+	blkptr_t *bp = zio->io_bp;
+	blkptr_t *bp_orig = &zio->io_bp_orig;
+	zio_prop_t *zp = &zio->io_prop;
+
+	ASSERT(BP_GET_LEVEL(bp) == 0);
+	ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
+	ASSERT(zp->zp_nopwrite);
+	ASSERT(!zp->zp_dedup);
+	ASSERT(zio->io_bp_override == NULL);
+	ASSERT(IO_IS_ALLOCATING(zio));
+
+	/*
+	 * Check to see if the original bp and the new bp have matching
+	 * characteristics (i.e. same checksum, compression algorithms, etc).
+	 * If they don't then just continue with the pipeline which will
+	 * allocate a new bp.
+	 */
+	if (BP_IS_HOLE(bp_orig) ||
+	    !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
+	    BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
+	    BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
+	    BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
+	    zp->zp_copies != BP_GET_NDVAS(bp_orig))
+		return (ZIO_PIPELINE_CONTINUE);
+
+	/*
+	 * If the checksums match then reset the pipeline so that we
+	 * avoid allocating a new bp and issuing any I/O.
+	 */
+	if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
+		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
+		ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
+		ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
+		ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
+		ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
+		    sizeof (uint64_t)) == 0);
+
+		*bp = *bp_orig;
+		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+		zio->io_flags |= ZIO_FLAG_NOPWRITE;
+	}
+
+	return (ZIO_PIPELINE_CONTINUE);
+}
+
+/*
  * ==========================================================================
  * Dedup
  * ==========================================================================
@@ -2121,7 +2202,7 @@ zio_ddt_write(zio_t *zio)
 			zio->io_stage = ZIO_STAGE_OPEN;
 			BP_ZERO(bp);
 		} else {
-			zp->zp_dedup = 0;
+			zp->zp_dedup = B_FALSE;
 		}
 		zio->io_pipeline = ZIO_WRITE_PIPELINE;
 		ddt_exit(ddt);
@@ -2778,7 +2859,8 @@ zio_ready(zio_t *zio)
 
 	if (zio->io_ready) {
 		ASSERT(IO_IS_ALLOCATING(zio));
-		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
+		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
+		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
 		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
 
 		zio->io_ready(zio);
@@ -2860,6 +2942,8 @@ zio_done(zio_t *zio)
 			ASSERT(BP_COUNT_GANG(bp) == 0 ||
 			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
 		}
+		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
+			VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
 	}
 
 	/*
@@ -2969,7 +3053,7 @@ zio_done(zio_t *zio)
 
 	if ((zio->io_error || zio->io_reexecute) &&
 	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
-	    !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
+	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
 		zio_dva_unallocate(zio, zio->io_gang_tree, bp);
 
 	zio_gang_tree_free(&zio->io_gang_tree);
@@ -3113,6 +3197,7 @@ static zio_pipe_stage_t *zio_pipeline[] 
 	zio_issue_async,
 	zio_write_bp_init,
 	zio_checksum_generate,
+	zio_nop_write,
 	zio_ddt_read_start,
 	zio_ddt_read_done,
 	zio_ddt_write,

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c	Sun Nov 25 16:19:12 2012	(r243523)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c	Sun Nov 25 16:32:07 2012	(r243524)
@@ -78,6 +78,7 @@
 #include <sys/vdev_impl.h>
 #include <sys/zvol.h>
 #include <sys/zil_impl.h>
+#include <sys/dbuf.h>
 #include <geom/geom.h>
 
 #include "zfs_namecheck.h"
@@ -1051,6 +1052,12 @@ zvol_get_data(void *arg, lr_write_t *lr,
 		error = dmu_buf_hold(os, object, offset, zgd, &db,
 		    DMU_READ_NO_PREFETCH);
 		if (error == 0) {
+			blkptr_t *obp = dmu_buf_get_blkptr(db);
+			if (obp) {
+				ASSERT(BP_IS_HOLE(bp));
+				*bp = *obp;
+			}
+
 			zgd->zgd_db = db;
 			zgd->zgd_bp = bp;
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201211251632.qAPGW8wd091581>