Skip site navigation (1)Skip section navigation (2)
Date:      Wed, 21 Feb 2018 14:33:00 +0000 (UTC)
From:      Andriy Gapon <avg@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r329715 - in vendor-sys/illumos/dist/uts/common/fs/zfs: . sys
Message-ID:  <201802211433.w1LEX0aZ002743@repo.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: avg
Date: Wed Feb 21 14:33:00 2018
New Revision: 329715
URL: https://svnweb.freebsd.org/changeset/base/329715

Log:
  8997 ztest assertion failure in zil_lwb_write_issue
  
  illumos/illumos-gate@f864f99efe57685e1762590c1a880dd16bca6da9
  https://github.com/illumos/illumos-gate/commit/f864f99efe57685e1762590c1a880dd16bca6da9
  
  https://www.illumos.org/issues/8997
    When dmu_tx_assign is called from zil_lwb_write_issue, it's possible
    for either ERESTART or EIO to be returned.
    If ERESTART is returned, this will cause an assertion to fail directly
    in zil_lwb_write_issue, where the code assumes the return value is
    EIO if dmu_tx_assign returns a non-zero value. This can occur if the
    SPA is suspended when dmu_tx_assign is called, and most often occurs
    when running zloop.
    If EIO is returned, this can cause assertions to fail elsewhere in the
    ZIL code. For example, zil_commit_waiter_timeout contains the
    following logic:
      lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
      ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
    In this case, if dmu_tx_assign returned EIO from within
    zil_lwb_write_issue, the lwb variable passed in will not be issued
    to disk. Thus, it's lwb_state field will remain LWB_STATE_OPENED and
    this assertion will fail. zil_commit_waiter_timeout assumes that after
    it calls zil_lwb_write_issue, the lwb will be issued to disk, and
    doesn't handle the case where this is not true; i.e. it doesn't handle
    the case where dmu_tx_assign returns EIO.
  
  Reviewed by: Matt Ahrens <mahrens@delphix.com>
  Reviewed by: Andriy Gapon <avg@FreeBSD.org>
  Approved by: Robert Mustacchi <rm@joyent.com>
  Author: Prakash Surya <prakash.surya@delphix.com>

Modified:
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_tx.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_tx.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_tx.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_tx.c	Wed Feb 21 14:31:48 2018	(r329714)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_tx.c	Wed Feb 21 14:33:00 2018	(r329715)
@@ -869,7 +869,7 @@ dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
  * decreasing performance.
  */
 static int
-dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
+dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
 {
 	spa_t *spa = tx->tx_pool->dp_spa;
 
@@ -889,13 +889,13 @@ dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
 		 * of the failuremode setting.
 		 */
 		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
-		    txg_how != TXG_WAIT)
+		    !(txg_how & TXG_WAIT))
 			return (SET_ERROR(EIO));
 
 		return (SET_ERROR(ERESTART));
 	}
 
-	if (!tx->tx_waited &&
+	if (!tx->tx_dirty_delayed &&
 	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
 		tx->tx_wait_dirty = B_TRUE;
 		return (SET_ERROR(ERESTART));
@@ -983,41 +983,44 @@ dmu_tx_unassign(dmu_tx_t *tx)
 }
 
 /*
- * Assign tx to a transaction group.  txg_how can be one of:
+ * Assign tx to a transaction group; txg_how is a bitmask:
  *
- * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
- *	a new one.  This should be used when you're not holding locks.
- *	It will only fail if we're truly out of space (or over quota).
+ * If TXG_WAIT is set and the currently open txg is full, this function
+ * will wait until there's a new txg. This should be used when no locks
+ * are being held. With this bit set, this function will only fail if
+ * we're truly out of space (or over quota).
  *
- * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
- *	blocking, returns immediately with ERESTART.  This should be used
- *	whenever you're holding locks.  On an ERESTART error, the caller
- *	should drop locks, do a dmu_tx_wait(tx), and try again.
+ * If TXG_WAIT is *not* set and we can't assign into the currently open
+ * txg without blocking, this function will return immediately with
+ * ERESTART. This should be used whenever locks are being held.  On an
+ * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
+ * and try again.
  *
- * (3)  TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
- *      has already been called on behalf of this operation (though
- *      most likely on a different tx).
+ * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
+ * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for
+ * details on the throttle). This is used by the VFS operations, after
+ * they have already called dmu_tx_wait() (though most likely on a
+ * different tx).
  */
 int
-dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
+dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
 {
 	int err;
 
 	ASSERT(tx->tx_txg == 0);
-	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
-	    txg_how == TXG_WAITED);
+	ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
 	ASSERT(!dsl_pool_sync_context(tx->tx_pool));
 
 	/* If we might wait, we must not hold the config lock. */
-	ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
+	IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
 
-	if (txg_how == TXG_WAITED)
-		tx->tx_waited = B_TRUE;
+	if ((txg_how & TXG_NOTHROTTLE))
+		tx->tx_dirty_delayed = B_TRUE;
 
 	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
 		dmu_tx_unassign(tx);
 
-		if (err != ERESTART || txg_how != TXG_WAIT)
+		if (err != ERESTART || !(txg_how & TXG_WAIT))
 			return (err);
 
 		dmu_tx_wait(tx);
@@ -1054,12 +1057,12 @@ dmu_tx_wait(dmu_tx_t *tx)
 		tx->tx_wait_dirty = B_FALSE;
 
 		/*
-		 * Note: setting tx_waited only has effect if the caller
-		 * used TX_WAIT.  Otherwise they are going to destroy
-		 * this tx and try again.  The common case, zfs_write(),
-		 * uses TX_WAIT.
+		 * Note: setting tx_dirty_delayed only has effect if the
+		 * caller used TX_WAIT.  Otherwise they are going to
+		 * destroy this tx and try again.  The common case,
+		 * zfs_write(), uses TX_WAIT.
 		 */
-		tx->tx_waited = B_TRUE;
+		tx->tx_dirty_delayed = B_TRUE;
 	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
 		/*
 		 * If the pool is suspended we need to wait until it

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h	Wed Feb 21 14:31:48 2018	(r329714)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h	Wed Feb 21 14:33:00 2018	(r329715)
@@ -231,11 +231,14 @@ typedef enum dmu_object_type {
 	DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
 } dmu_object_type_t;
 
-typedef enum txg_how {
-	TXG_WAIT = 1,
-	TXG_NOWAIT,
-	TXG_WAITED,
-} txg_how_t;
+/*
+ * These flags are intended to be used to specify the "txg_how"
+ * parameter when calling the dmu_tx_assign() function. See the comment
+ * above dmu_tx_assign() for more details on the meaning of these flags.
+ */
+#define	TXG_NOWAIT	(0ULL)
+#define	TXG_WAIT	(1ULL<<0)
+#define	TXG_NOTHROTTLE	(1ULL<<1)
 
 void byteswap_uint64_array(void *buf, size_t size);
 void byteswap_uint32_array(void *buf, size_t size);
@@ -689,7 +692,7 @@ void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
 void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
 void dmu_tx_abort(dmu_tx_t *tx);
-int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
 void dmu_tx_wait(dmu_tx_t *tx);
 void dmu_tx_commit(dmu_tx_t *tx);
 void dmu_tx_mark_netfree(dmu_tx_t *tx);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_tx.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_tx.h	Wed Feb 21 14:31:48 2018	(r329714)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_tx.h	Wed Feb 21 14:33:00 2018	(r329715)
@@ -67,9 +67,6 @@ struct dmu_tx {
 	/* placeholder for syncing context, doesn't need specific holds */
 	boolean_t tx_anyobj;
 
-	/* has this transaction already been delayed? */
-	boolean_t tx_waited;
-
 	/* transaction is marked as being a "net free" of space */
 	boolean_t tx_netfree;
 
@@ -79,6 +76,9 @@ struct dmu_tx {
 	/* need to wait for sufficient dirty space */
 	boolean_t tx_wait_dirty;
 
+	/* has this transaction already been delayed? */
+	boolean_t tx_dirty_delayed;
+
 	int tx_err;
 };
 
@@ -114,7 +114,7 @@ typedef struct dmu_tx_callback {
  * These routines are defined in dmu.h, and are called by the user.
  */
 dmu_tx_t *dmu_tx_create(objset_t *dd);
-int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
 void dmu_tx_commit(dmu_tx_t *tx);
 void dmu_tx_abort(dmu_tx_t *tx);
 uint64_t dmu_tx_get_txg(dmu_tx_t *tx);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c	Wed Feb 21 14:31:48 2018	(r329714)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vnops.c	Wed Feb 21 14:33:00 2018	(r329715)
@@ -135,7 +135,7 @@
  *
  *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
  *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
- *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
+ *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
  *	to indicate that this operation has already called dmu_tx_wait().
  *	This will ensure that we don't retry forever, waiting a short bit
  *	each time.
@@ -160,7 +160,7 @@
  *	rw_enter(...);			// grab any other locks you need
  *	tx = dmu_tx_create(...);	// get DMU tx
  *	dmu_tx_hold_*();		// hold each object you might modify
- *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+ *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
  *	if (error) {
  *		rw_exit(...);		// drop locks
  *		zfs_dirent_unlock(dl);	// unlock directory entry
@@ -1532,7 +1532,8 @@ top:
 			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
 			    0, acl_ids.z_aclp->z_acl_bytes);
 		}
-		error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+		error = dmu_tx_assign(tx,
+		    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 		if (error) {
 			zfs_dirent_unlock(dl);
 			if (error == ERESTART) {
@@ -1762,7 +1763,7 @@ top:
 	 */
 	dmu_tx_mark_netfree(tx);
 
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		VN_RELE(vp);
@@ -1999,7 +2000,7 @@ top:
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
@@ -2136,7 +2137,7 @@ top:
 	zfs_sa_upgrade_txholds(tx, zp);
 	zfs_sa_upgrade_txholds(tx, dzp);
 	dmu_tx_mark_netfree(tx);
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		rw_exit(&zp->z_parent_lock);
 		rw_exit(&zp->z_name_lock);
@@ -3709,7 +3710,7 @@ top:
 
 	zfs_sa_upgrade_txholds(tx, szp);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		if (zl != NULL)
 			zfs_rename_unlock(&zl);
@@ -3902,7 +3903,7 @@ top:
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {
@@ -4123,7 +4124,7 @@ top:
 	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
 	zfs_sa_upgrade_txholds(tx, szp);
 	zfs_sa_upgrade_txholds(tx, dzp);
-	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
+	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 	if (error) {
 		zfs_dirent_unlock(dl);
 		if (error == ERESTART) {

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c	Wed Feb 21 14:31:48 2018	(r329714)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c	Wed Feb 21 14:33:00 2018	(r329715)
@@ -1208,22 +1208,13 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 	tx = dmu_tx_create(zilog->zl_os);
 
 	/*
-	 * Since we are not going to create any new dirty data and we can even
-	 * help with clearing the existing dirty data, we should not be subject
-	 * to the dirty data based delays.
-	 * We (ab)use TXG_WAITED to bypass the delay mechanism.
-	 * One side effect from using TXG_WAITED is that dmu_tx_assign() can
-	 * fail if the pool is suspended.  Those are dramatic circumstances,
-	 * so we return NULL to signal that the normal ZIL processing is not
-	 * possible and txg_wait_synced() should be used to ensure that the data
-	 * is on disk.
+	 * Since we are not going to create any new dirty data, and we
+	 * can even help with clearing the existing dirty data, we
+	 * should not be subject to the dirty data based delays. We
+	 * use TXG_NOTHROTTLE to bypass the delay mechanism.
 	 */
-	error = dmu_tx_assign(tx, TXG_WAITED);
-	if (error != 0) {
-		ASSERT3S(error, ==, EIO);
-		dmu_tx_abort(tx);
-		return (NULL);
-	}
+	VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
+
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201802211433.w1LEX0aZ002743>