Date:      Fri, 30 Jun 2017 09:34:50 +0000 (UTC)
From:      Andriy Gapon <avg@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org
Subject:   svn commit: r320496 - in stable/10: cddl/contrib/opensolaris/cmd/ztest sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys
Message-ID:  <201706300934.v5U9Yo8k026872@repo.freebsd.org>

Author: avg
Date: Fri Jun 30 09:34:50 2017
New Revision: 320496
URL: https://svnweb.freebsd.org/changeset/base/320496

Log:
  MFC r308782:
  After some ZIL changes six years ago, zil_slog_limit became partially
  broken because zl_itx_list_sz was no longer updated when async itx'es
  were upgraded to sync.  Because of other changes made around the same
  time, zl_itx_list_sz is in fact no longer required to implement the
  functionality, so this patch removes the unneeded broken code and
  variables.
  
  The original idea of zil_slog_limit was to reduce the chance of SLOG
  abuse by a single heavy logger, which increased latency for other
  (more latency-critical) loggers, by pushing the heavy log out into the
  main pool instead of the SLOG.  Besides a huge latency increase for
  heavy writers, this implementation caused a double write of all data,
  since the log records were explicitly prepared for the SLOG.  Since we
  now have an I/O scheduler, I've found it can be much more efficient to
  reduce the priority of a heavy logger's SLOG writes from
  ZIO_PRIORITY_SYNC_WRITE to ZIO_PRIORITY_ASYNC_WRITE while still
  leaving them on the SLOG.
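
  For illustration, a condensed view of the resulting priority decision,
  as added to zil_lwb_write_init() by this patch (zil_slog_limit is the
  new tunable; lwb_slog records whether the block was allocated on a
  SLOG device):

  	zio_priority_t prio;

  	/*
  	 * Demote to async priority only once this commit has already
  	 * pushed more than zil_slog_limit bytes and the block actually
  	 * lives on a SLOG device; main-pool log blocks keep sync
  	 * priority.
  	 */
  	if (zilog->zl_cur_used <= zil_slog_limit || !lwb->lwb_slog)
  		prio = ZIO_PRIORITY_SYNC_WRITE;
  	else
  		prio = ZIO_PRIORITY_ASYNC_WRITE;

  Main-pool log blocks always keep sync priority; only a dedicated SLOG
  device needs protection from a single heavy writer.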
  
  The existing ZIL implementation also had a space-efficiency problem
  when it had to write large chunks of data into log blocks of limited
  size.  In some cases efficiency dropped to almost as low as 50%.  With
  the ZIL stored on spinning rust, that also cut the log write speed in
  half, since the head had to uselessly fly over allocated but unwritten
  areas.  This change improves the situation by offloading the
  problematic operations from z*_log_write() to zil_lwb_commit(), which
  knows the actual state of log block allocation and can split large
  requests into pieces much more efficiently.  As a side effect it also
  removes one of the two data copy operations done by the ZIL code in
  the WR_COPIED case.
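
  A condensed sketch of the splitting loop this patch adds to
  zil_lwb_commit() (names follow the diff; new-block allocation and
  error handling omitted):

  	uint64_t dnow, lwb_sp;
  cont:
  	lwb_sp = lwb->lwb_sz - lwb->lwb_nused;	/* space left in this block */
  	/* ... open a new log block here if not even a minimal chunk fits ... */
  	dnow = MIN(dlen, lwb_sp - reclen);	/* data bytes that fit right now */
  	/* ... copy the record header plus dnow bytes of data into the lwb ... */
  	lwb->lwb_nused += reclen + dnow;
  	dlen -= dnow;
  	if (dlen > 0)				/* remainder goes to the next block */
  		goto cont;

  Per the new comment in zil_lwb_commit(), the new-block condition gives
  up at most ZIL_MAX_LOG_DATA / 8 (about 12%) of a block rather than
  create a tiny leftover chunk.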
  
  While there, untangle and unify the code of the z*_log_write()
  functions.  Also, zfs_log_write(), like zvol_log_write(), can now
  handle writes crossing a block boundary, which may further improve
  efficiency if the ZPL is ever made to issue such writes.
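
  A worked example of the new WR_INDIRECT chunking, with hypothetical
  numbers: given a 128K block size, a 192K write starting at offset 96K
  is now split on block boundaries:

  	/* A WR_INDIRECT chunk never crosses a block boundary: */
  	len = MIN(blocksize - P2PHASE(off, blocksize), resid);

  	/*
  	 * blocksize = 128K, off = 96K, resid = 192K:
  	 *   1st chunk: len = 128K - 96K = 32K  (off -> 128K, resid -> 160K)
  	 *   2nd chunk: len = 128K              (off -> 256K, resid -> 32K)
  	 *   3rd chunk: len = 32K               (done)
  	 */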

Modified:
  stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
  stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c
==============================================================================
--- stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Fri Jun 30 07:48:08 2017	(r320495)
+++ stable/10/cddl/contrib/opensolaris/cmd/ztest/ztest.c	Fri Jun 30 09:34:50 2017	(r320496)
@@ -1371,7 +1371,6 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write
 	itx->itx_private = zd;
 	itx->itx_wr_state = write_state;
 	itx->itx_sync = (ztest_random(8) == 0);
-	itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
 
 	bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
 	    sizeof (*lr) - sizeof (lr_t));

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h	Fri Jun 30 07:48:08 2017	(r320495)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h	Fri Jun 30 09:34:50 2017	(r320496)
@@ -369,7 +369,6 @@ typedef struct itx {
 	void		*itx_private;	/* type-specific opaque data */
 	itx_wr_state_t	itx_wr_state;	/* write state */
 	uint8_t		itx_sync;	/* synchronous transaction */
-	uint64_t	itx_sod;	/* record size on disk */
 	uint64_t	itx_oid;	/* object id */
 	lr_t		itx_lr;		/* common part of log record */
 	/* followed by type-specific part of lr_xx_t and its immediate data */

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h	Fri Jun 30 07:48:08 2017	(r320495)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h	Fri Jun 30 09:34:50 2017	(r320496)
@@ -42,6 +42,7 @@ extern "C" {
 typedef struct lwb {
 	zilog_t		*lwb_zilog;	/* back pointer to log struct */
 	blkptr_t	lwb_blk;	/* on disk address of this log blk */
+	boolean_t	lwb_slog;	/* lwb_blk is on SLOG device */
 	int		lwb_nused;	/* # used bytes in buffer */
 	int		lwb_sz;		/* size of block and buffer */
 	char		*lwb_buf;	/* log write buffer */
@@ -62,7 +63,6 @@ typedef struct itxs {
 typedef struct itxg {
 	kmutex_t	itxg_lock;	/* lock for this structure */
 	uint64_t	itxg_txg;	/* txg for this chain */
-	uint64_t	itxg_sod;	/* total size on disk for this txg */
 	itxs_t		*itxg_itxs;	/* sync and async itxs */
 } itxg_t;
 
@@ -120,7 +120,6 @@ struct zilog {
 	kcondvar_t	zl_cv_batch[2];	/* batch condition variables */
 	itxg_t		zl_itxg[TXG_SIZE]; /* intent log txg chains */
 	list_t		zl_itx_commit_list; /* itx list to be committed */
-	uint64_t	zl_itx_list_sz;	/* total size of records on list */
 	uint64_t	zl_cur_used;	/* current commit log size used */
 	list_t		zl_lwb_list;	/* in-flight log write list */
 	kmutex_t	zl_vdev_lock;	/* protects zl_vdev_tree */
@@ -142,6 +141,8 @@ typedef struct zil_bp_node {
 
 #define	ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
     sizeof (lr_write_t))
+#define	ZIL_MAX_COPIED_DATA \
+    ((SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t))
 
 #ifdef	__cplusplus
 }

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	Fri Jun 30 07:48:08 2017	(r320495)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h	Fri Jun 30 09:34:50 2017	(r320496)
@@ -546,7 +546,7 @@ extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, ui
     const blkptr_t *bp, uint64_t size, enum zio_flag flags);
 
 extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
-    blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
+    blkptr_t *old_bp, uint64_t size, boolean_t *slog);
 extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
 extern void zio_flush(zio_t *zio, vdev_t *vd);
 extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset,

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c	Fri Jun 30 07:48:08 2017	(r320495)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c	Fri Jun 30 09:34:50 2017	(r320496)
@@ -459,21 +459,18 @@ void
 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *zp, offset_t off, ssize_t resid, int ioflag)
 {
+	uint32_t blocksize = zp->z_blksz;
 	itx_wr_state_t write_state;
-	boolean_t slogging;
 	uintptr_t fsync_cnt;
-	ssize_t immediate_write_sz;
 
 	if (zil_replaying(zilog, tx) || zp->z_unlinked)
 		return;
 
-	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
-	    ? 0 : zfs_immediate_write_sz;
-
-	slogging = spa_has_slogs(zilog->zl_spa) &&
-	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
-	if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
+	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
 		write_state = WR_INDIRECT;
+	else if (!spa_has_slogs(zilog->zl_spa) &&
+	    resid >= zfs_immediate_write_sz)
+		write_state = WR_INDIRECT;
 	else if (ioflag & (FSYNC | FDSYNC))
 		write_state = WR_COPIED;
 	else
@@ -486,30 +483,26 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype
 	while (resid) {
 		itx_t *itx;
 		lr_write_t *lr;
-		ssize_t len;
+		itx_wr_state_t wr_state = write_state;
+		ssize_t len = resid;
 
-		/*
-		 * If the write would overflow the largest block then split it.
-		 */
-		if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
-			len = SPA_OLD_MAXBLOCKSIZE >> 1;
-		else
-			len = resid;
+		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
+			wr_state = WR_NEED_COPY;
+		else if (wr_state == WR_INDIRECT)
+			len = MIN(blocksize - P2PHASE(off, blocksize), resid);
 
 		itx = zil_itx_create(txtype, sizeof (*lr) +
-		    (write_state == WR_COPIED ? len : 0));
+		    (wr_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
-		if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
+		if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
 		    zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(txtype, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
-			write_state = WR_NEED_COPY;
+			wr_state = WR_NEED_COPY;
 		}
 
-		itx->itx_wr_state = write_state;
-		if (write_state == WR_NEED_COPY)
-			itx->itx_sod += len;
+		itx->itx_wr_state = wr_state;
 		lr->lr_foid = zp->z_id;
 		lr->lr_offset = off;
 		lr->lr_length = len;

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c	Fri Jun 30 07:48:08 2017	(r320495)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c	Fri Jun 30 09:34:50 2017	(r320496)
@@ -90,6 +90,15 @@ TUNABLE_INT("vfs.zfs.trim.enabled", &zfs_trim_enabled)
 SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0,
     "Enable ZFS TRIM");
 
+/*
+ * Limit the SLOG write size per commit executed with synchronous priority.
+ * Any writes above that are executed with lower (asynchronous) priority, to
+ * limit potential SLOG device abuse by a single active ZIL writer.
+ */
+uint64_t zil_slog_limit = 768 * 1024;
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_limit, CTLFLAG_RWTUN,
+    &zil_slog_limit, 0, "Maximal SLOG commit size with sync priority");
+
 static kmem_cache_t *zil_lwb_cache;
 
 #define	LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
@@ -449,13 +458,14 @@ zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *t
 }
 
 static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
 {
 	lwb_t *lwb;
 
 	lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
 	lwb->lwb_zilog = zilog;
 	lwb->lwb_blk = *bp;
+	lwb->lwb_slog = slog;
 	lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
 	lwb->lwb_max_txg = txg;
 	lwb->lwb_zio = NULL;
@@ -539,6 +549,7 @@ zil_create(zilog_t *zilog)
 	dmu_tx_t *tx = NULL;
 	blkptr_t blk;
 	int error = 0;
+	boolean_t slog = FALSE;
 
 	/*
 	 * Wait for any previous destroy to complete.
@@ -567,7 +578,7 @@ zil_create(zilog_t *zilog)
 		}
 
 		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
-		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+		    ZIL_MIN_BLKSZ, &slog);
 
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
@@ -577,7 +588,7 @@ zil_create(zilog_t *zilog)
 	 * Allocate a log write buffer (lwb) for the first log block.
 	 */
 	if (error == 0)
-		lwb = zil_alloc_lwb(zilog, &blk, txg);
+		lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
@@ -910,6 +921,7 @@ static void
 zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
 {
 	zbookmark_phys_t zb;
+	zio_priority_t prio;
 
 	SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
 	    ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
@@ -920,9 +932,13 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
 		    ZIO_FLAG_CANFAIL);
 	}
 	if (lwb->lwb_zio == NULL) {
+		if (zilog->zl_cur_used <= zil_slog_limit || !lwb->lwb_slog)
+			prio = ZIO_PRIORITY_SYNC_WRITE;
+		else
+			prio = ZIO_PRIORITY_ASYNC_WRITE;
 		lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
 		    0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
-		    zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
+		    zil_lwb_write_done, lwb, prio,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
 	}
 }
@@ -942,16 +958,6 @@ uint64_t zil_block_buckets[] = {
 };
 
 /*
- * Use the slog as long as the logbias is 'latency' and the current commit size
- * is less than the limit or the total list size is less than 2X the limit.
- * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
- */
-uint64_t zil_slog_limit = 1024 * 1024;
-#define	USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
-	(((zilog)->zl_cur_used < zil_slog_limit) || \
-	((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
-
-/*
  * Start a log block write and advance to the next log block.
  * Calls are serialized.
  */
@@ -966,6 +972,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb, boolea
 	uint64_t txg;
 	uint64_t zil_blksz, wsz;
 	int i, error;
+	boolean_t slog;
 
 	if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
 		zilc = (zil_chain_t *)lwb->lwb_buf;
@@ -1022,8 +1029,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb, boolea
 
 	BP_ZERO(bp);
 	/* pass the old blkptr in order to spread log blocks across devs */
-	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
-	    USE_SLOG(zilog));
+	error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
 	if (error == 0) {
 		ASSERT3U(bp->blk_birth, ==, txg);
 		bp->blk_cksum = lwb->lwb_blk.blk_cksum;
@@ -1032,7 +1038,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb, boolea
 		/*
 		 * Allocate a new log write buffer (lwb).
 		 */
-		nlwb = zil_alloc_lwb(zilog, bp, txg);
+		nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
 
 		/* Record the block for later vdev flushing */
 		zil_add_block(zilog, &lwb->lwb_blk);
@@ -1071,12 +1077,13 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb, boolea
 static lwb_t *
 zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 {
-	lr_t *lrc = &itx->itx_lr; /* common log record */
-	lr_write_t *lrw = (lr_write_t *)lrc;
+	lr_t *lrcb, *lrc = &itx->itx_lr; /* common log record */
+	lr_write_t *lrwb, *lrw = (lr_write_t *)lrc;
 	char *lr_buf;
 	uint64_t txg = lrc->lrc_txg;
 	uint64_t reclen = lrc->lrc_reclen;
 	uint64_t dlen = 0;
+	uint64_t dnow, lwb_sp;
 
 	if (lwb == NULL)
 		return (NULL);
@@ -1091,25 +1098,30 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 
 	zil_lwb_write_init(zilog, lwb);
 
+cont:
 	/*
 	 * If this record won't fit in the current log block, start a new one.
+	 * For WR_NEED_COPY optimize layout for minimal number of chunks, but
+	 * try to keep wasted space within a reasonable range (12%).
 	 */
-	if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
+	lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+	if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
+	    lwb_sp < ZIL_MAX_LOG_DATA / 8 && (dlen % ZIL_MAX_LOG_DATA == 0 ||
+	    lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
 		lwb = zil_lwb_write_start(zilog, lwb, B_FALSE);
 		if (lwb == NULL)
 			return (NULL);
 		zil_lwb_write_init(zilog, lwb);
 		ASSERT(LWB_EMPTY(lwb));
-		if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
-			txg_wait_synced(zilog->zl_dmu_pool, txg);
-			return (lwb);
-		}
+		lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+		ASSERT3U(reclen + MIN(dlen, sizeof(uint64_t)), <=, lwb_sp);
 	}
 
+	dnow = MIN(dlen, lwb_sp - reclen);
 	lr_buf = lwb->lwb_buf + lwb->lwb_nused;
 	bcopy(lrc, lr_buf, reclen);
-	lrc = (lr_t *)lr_buf;
-	lrw = (lr_write_t *)lrc;
+	lrcb = (lr_t *)lr_buf;
+	lrwb = (lr_write_t *)lrcb;
 
 	/*
 	 * If it's a write, fetch the data or get its blkptr as appropriate.
@@ -1121,16 +1133,19 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 			char *dbuf;
 			int error;
 
-			if (dlen) {
-				ASSERT(itx->itx_wr_state == WR_NEED_COPY);
+			if (itx->itx_wr_state == WR_NEED_COPY) {
 				dbuf = lr_buf + reclen;
-				lrw->lr_common.lrc_reclen += dlen;
+				lrcb->lrc_reclen += dnow;
+				if (lrwb->lr_length > dnow)
+					lrwb->lr_length = dnow;
+				lrw->lr_offset += dnow;
+				lrw->lr_length -= dnow;
 			} else {
 				ASSERT(itx->itx_wr_state == WR_INDIRECT);
 				dbuf = NULL;
 			}
 			error = zilog->zl_get_data(
-			    itx->itx_private, lrw, dbuf, lwb->lwb_zio);
+			    itx->itx_private, lrwb, dbuf, lwb->lwb_zio);
 			if (error == EIO) {
 				txg_wait_synced(zilog->zl_dmu_pool, txg);
 				return (lwb);
@@ -1149,12 +1164,18 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
 	 * equal to the itx sequence number because not all transactions
 	 * are synchronous, and sometimes spa_sync() gets there first.
 	 */
-	lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
-	lwb->lwb_nused += reclen + dlen;
+	lrcb->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
+	lwb->lwb_nused += reclen + dnow;
 	lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
 	ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
 	ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
 
+	dlen -= dnow;
+	if (dlen > 0) {
+		zilog->zl_cur_used += reclen;
+		goto cont;
+	}
+
 	return (lwb);
 }
 
@@ -1168,7 +1189,6 @@ zil_itx_create(uint64_t txtype, size_t lrsize)
 	itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
 	itx->itx_lr.lrc_txtype = txtype;
 	itx->itx_lr.lrc_reclen = lrsize;
-	itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
 	itx->itx_lr.lrc_seq = 0;	/* defensive */
 	itx->itx_sync = B_TRUE;		/* default is synchronous */
 
@@ -1317,11 +1337,8 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *t
 			 * this itxg. Save the itxs for release below.
 			 * This should be rare.
 			 */
-			atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
-			itxg->itxg_sod = 0;
 			clean = itxg->itxg_itxs;
 		}
-		ASSERT(itxg->itxg_sod == 0);
 		itxg->itxg_txg = txg;
 		itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
 
@@ -1333,8 +1350,6 @@ zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *t
 	}
 	if (itx->itx_sync) {
 		list_insert_tail(&itxs->i_sync_list, itx);
-		atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
-		itxg->itxg_sod += itx->itx_sod;
 	} else {
 		avl_tree_t *t = &itxs->i_async_tree;
 		uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
@@ -1382,8 +1397,6 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
 	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
 	ASSERT(itxg->itxg_txg != 0);
 	ASSERT(zilog->zl_clean_taskq != NULL);
-	atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
-	itxg->itxg_sod = 0;
 	clean_me = itxg->itxg_itxs;
 	itxg->itxg_itxs = NULL;
 	itxg->itxg_txg = 0;
@@ -1407,7 +1420,6 @@ zil_get_commit_list(zilog_t *zilog)
 {
 	uint64_t otxg, txg;
 	list_t *commit_list = &zilog->zl_itx_commit_list;
-	uint64_t push_sod = 0;
 
 	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
 		otxg = ZILTEST_TXG;
@@ -1439,12 +1451,9 @@ zil_get_commit_list(zilog_t *zilog)
 		ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
 		    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
 		list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
-		push_sod += itxg->itxg_sod;
-		itxg->itxg_sod = 0;
 
 		mutex_exit(&itxg->itxg_lock);
 	}
-	atomic_add_64(&zilog->zl_itx_list_sz, -push_sod);
 }
 
 /*

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Fri Jun 30 07:48:08 2017	(r320495)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c	Fri Jun 30 09:34:50 2017	(r320496)
@@ -2943,20 +2943,21 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, bl
  */
 int
 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
-    uint64_t size, boolean_t use_slog)
+    uint64_t size, boolean_t *slog)
 {
 	int error = 1;
 
 	ASSERT(txg > spa_syncing_txg(spa));
 
-	if (use_slog) {
-		error = metaslab_alloc(spa, spa_log_class(spa), size,
-		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
-	}
-
-	if (error) {
+	error = metaslab_alloc(spa, spa_log_class(spa), size,
+	    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
+	if (error == 0) {
+		*slog = TRUE;
+	} else {
 		error = metaslab_alloc(spa, spa_normal_class(spa), size,
 		    new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
+		if (error == 0)
+			*slog = FALSE;
 	}
 
 	if (error == 0) {

Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
==============================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c	Fri Jun 30 07:48:08 2017	(r320495)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c	Fri Jun 30 09:34:50 2017	(r320496)
@@ -1362,54 +1362,44 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_
 {
 	uint32_t blocksize = zv->zv_volblocksize;
 	zilog_t *zilog = zv->zv_zilog;
-	boolean_t slogging;
-	ssize_t immediate_write_sz;
+	itx_wr_state_t write_state;
 
 	if (zil_replaying(zilog, tx))
 		return;
 
-	immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
-	    ? 0 : zvol_immediate_write_sz;
+	if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+		write_state = WR_INDIRECT;
+	else if (!spa_has_slogs(zilog->zl_spa) &&
+	    resid >= blocksize && blocksize > zvol_immediate_write_sz)
+		write_state = WR_INDIRECT;
+	else if (sync)
+		write_state = WR_COPIED;
+	else
+		write_state = WR_NEED_COPY;
 
-	slogging = spa_has_slogs(zilog->zl_spa) &&
-	    (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
-
 	while (resid) {
 		itx_t *itx;
 		lr_write_t *lr;
-		ssize_t len;
-		itx_wr_state_t write_state;
+		itx_wr_state_t wr_state = write_state;
+		ssize_t len = resid;
 
-		/*
-		 * Unlike zfs_log_write() we can be called with
-		 * upto DMU_MAX_ACCESS/2 (5MB) writes.
-		 */
-		if (blocksize > immediate_write_sz && !slogging &&
-		    resid >= blocksize && off % blocksize == 0) {
-			write_state = WR_INDIRECT; /* uses dmu_sync */
-			len = blocksize;
-		} else if (sync) {
-			write_state = WR_COPIED;
-			len = MIN(ZIL_MAX_LOG_DATA, resid);
-		} else {
-			write_state = WR_NEED_COPY;
-			len = MIN(ZIL_MAX_LOG_DATA, resid);
-		}
+		if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
+			wr_state = WR_NEED_COPY;
+		else if (wr_state == WR_INDIRECT)
+			len = MIN(blocksize - P2PHASE(off, blocksize), resid);
 
 		itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
-		    (write_state == WR_COPIED ? len : 0));
+		    (wr_state == WR_COPIED ? len : 0));
 		lr = (lr_write_t *)&itx->itx_lr;
-		if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
+		if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
 		    ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
 			zil_itx_destroy(itx);
 			itx = zil_itx_create(TX_WRITE, sizeof (*lr));
 			lr = (lr_write_t *)&itx->itx_lr;
-			write_state = WR_NEED_COPY;
+			wr_state = WR_NEED_COPY;
 		}
 
-		itx->itx_wr_state = write_state;
-		if (write_state == WR_NEED_COPY)
-			itx->itx_sod += len;
+		itx->itx_wr_state = wr_state;
 		lr->lr_foid = ZVOL_OBJ;
 		lr->lr_offset = off;
 		lr->lr_length = len;


