From owner-svn-src-stable-10@FreeBSD.ORG Thu Aug 21 22:44:11 2014 Return-Path: Delivered-To: svn-src-stable-10@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [8.8.178.115]) (using TLSv1 with cipher ADH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTPS id 5AB92D13; Thu, 21 Aug 2014 22:44:11 +0000 (UTC) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id 43DCE308F; Thu, 21 Aug 2014 22:44:11 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.9/8.14.9) with ESMTP id s7LMiBS4033533; Thu, 21 Aug 2014 22:44:11 GMT (envelope-from smh@FreeBSD.org) Received: (from smh@localhost) by svn.freebsd.org (8.14.9/8.14.9/Submit) id s7LMi8L4033507; Thu, 21 Aug 2014 22:44:08 GMT (envelope-from smh@FreeBSD.org) Message-Id: <201408212244.s7LMi8L4033507@svn.freebsd.org> X-Authentication-Warning: svn.freebsd.org: smh set sender to smh@FreeBSD.org using -f From: Steven Hartland Date: Thu, 21 Aug 2014 22:44:08 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org Subject: svn commit: r270312 - in stable/10/sys/cddl: compat/opensolaris/sys contrib/opensolaris/uts/common/fs/zfs contrib/opensolaris/uts/common/fs/zfs/sys X-SVN-Group: stable-10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-stable-10@freebsd.org X-Mailman-Version: 2.1.18-1 Precedence: list List-Id: SVN commit messages for only the 10-stable src tree List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 21 Aug 2014 22:44:11 -0000 Author: smh Date: Thu Aug 21 22:44:08 2014 New Revision: 270312 URL: http://svnweb.freebsd.org/changeset/base/270312 Log: MFC r265152 - Reintroduce priority for the TRIM ZIOs instead of using the "NOW" priority MFC r265321 - Fix double fault panic when returning EOPNOTSUPP MFC r269407 - Don't return ZIO_PIPELINE_CONTINUE from vdev_op_io_start methods Sponsored by: Multiplay Modified: stable/10/sys/cddl/compat/opensolaris/sys/dkio.h stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Directory Properties: stable/10/ (props changed) Modified: stable/10/sys/cddl/compat/opensolaris/sys/dkio.h ============================================================================== --- stable/10/sys/cddl/compat/opensolaris/sys/dkio.h Thu Aug 21 22:42:02 2014 (r270311) +++ stable/10/sys/cddl/compat/opensolaris/sys/dkio.h Thu Aug 21 22:44:08 2014 (r270312) @@ -75,8 +75,6 @@ extern "C" { */ #define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ -#define DKIOCTRIM (DKIOC|35) /* TRIM a block */ - struct dk_callback { void (*dkc_callback)(void *dkc_cookie, int error); void *dkc_cookie; Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h Thu Aug 21 22:42:02 2014 (r270311) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h Thu Aug 21 22:44:08 2014 (r270312) @@ -146,9 +146,10 @@ typedef enum zio_priority { ZIO_PRIORITY_ASYNC_READ, /* prefetch */ ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ + ZIO_PRIORITY_TRIM, /* free requests used for TRIM */ ZIO_PRIORITY_NUM_QUEUEABLE, - ZIO_PRIORITY_NOW /* non-queued i/os (e.g. free) */ + ZIO_PRIORITY_NOW /* non-queued I/Os (e.g. ioctl) */ } zio_priority_t; #define ZIO_PIPELINE_CONTINUE 0x100 @@ -361,7 +362,7 @@ typedef struct zio_transform { struct zio_transform *zt_next; } zio_transform_t; -typedef int zio_pipe_stage_t(zio_t **ziop); +typedef int zio_pipe_stage_t(zio_t *zio); /* * The io_reexecute flags are distinct from io_flags because the child must @@ -520,7 +521,7 @@ extern zio_t *zio_claim(zio_t *pio, spa_ extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv, - enum zio_flag flags); + zio_priority_t priority, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h Thu Aug 21 22:42:02 2014 (r270311) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h Thu Aug 21 22:44:08 2014 (r270312) @@ -215,6 +215,10 @@ enum zio_stage { ZIO_STAGE_FREE_BP_INIT | \ ZIO_STAGE_DVA_FREE) +#define ZIO_FREE_PHYS_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES) + #define ZIO_DDT_FREE_PIPELINE \ (ZIO_INTERLOCK_STAGES | \ ZIO_STAGE_FREE_BP_INIT | \ Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c Thu Aug 21 22:42:02 2014 (r270311) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c Thu Aug 21 22:44:08 2014 (r270312) @@ -449,7 +449,7 @@ trim_map_vdev_commit(spa_t *spa, zio_t * { trim_map_t *tm = vd->vdev_trimmap; trim_seg_t *ts; - uint64_t size, txgtarget, txgsafe; + uint64_t size, offset, txgtarget, txgsafe; hrtime_t timelimit; ASSERT(vd->vdev_ops->vdev_op_leaf); @@ -477,9 +477,20 @@ trim_map_vdev_commit(spa_t *spa, zio_t * avl_remove(&tm->tm_queued_frees, ts); avl_add(&tm->tm_inflight_frees, ts); size = ts->ts_end - ts->ts_start; - zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, size)); + offset = ts->ts_start; TRIM_MAP_SDEC(tm, size); TRIM_MAP_QDEC(tm); + /* + * We drop the lock while we call zio_nowait as the IO + * scheduler can result in a different IO being run e.g. + * a write which would result in a recursive lock. + */ + mutex_exit(&tm->tm_lock); + + zio_nowait(zio_trim(zio, spa, vd, offset, size)); + + mutex_enter(&tm->tm_lock); + ts = trim_map_first(tm, txgtarget, txgsafe, timelimit); } mutex_exit(&tm->tm_lock); } Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c Thu Aug 21 22:42:02 2014 (r270311) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c Thu Aug 21 22:44:08 2014 (r270312) @@ -684,7 +684,7 @@ vdev_disk_io_intr(buf_t *bp) * Rather than teach the rest of the stack about other error * possibilities (EFAULT, etc), we normalize the error value here. */ - zio->io_error = (geterror(bp) != 0 ? EIO : 0); + zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0); if (zio->io_error == 0 && bp->b_resid != 0) zio->io_error = SET_ERROR(EIO); @@ -730,15 +730,17 @@ vdev_disk_io_start(zio_t *zio) * Nothing to be done here but return failure. */ if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) { - zio->io_error = ENXIO; - return (ZIO_PIPELINE_CONTINUE); + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); } if (zio->io_type == ZIO_TYPE_IOCTL) { /* XXPOLICY */ if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); - return (ZIO_PIPELINE_CONTINUE); + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); } switch (zio->io_cmd) { @@ -790,7 +792,8 @@ vdev_disk_io_start(zio_t *zio) zio->io_error = SET_ERROR(ENOTSUP); } - return (ZIO_PIPELINE_CONTINUE); + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); } vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP); Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c Thu Aug 21 22:42:02 2014 (r270311) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c Thu Aug 21 22:44:08 2014 (r270312) @@ -164,7 +164,8 @@ vdev_file_io_start(zio_t *zio) if (!vdev_readable(vd)) { zio->io_error = SET_ERROR(ENXIO); - return (ZIO_PIPELINE_CONTINUE); + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); } vf = vd->vdev_tsd; @@ -180,7 +181,8 @@ vdev_file_io_start(zio_t *zio) zio->io_error = SET_ERROR(ENOTSUP); } - return (ZIO_PIPELINE_CONTINUE); + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); } zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ? Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c Thu Aug 21 22:42:02 2014 (r270311) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c Thu Aug 21 22:44:08 2014 (r270312) @@ -716,7 +716,7 @@ vdev_geom_io_intr(struct bio *bp) vd = zio->io_vd; zio->io_error = bp->bio_error; if (zio->io_error == 0 && bp->bio_resid != 0) - zio->io_error = EIO; + zio->io_error = SET_ERROR(EIO); switch(zio->io_error) { case ENOTSUP: @@ -765,41 +765,43 @@ vdev_geom_io_start(zio_t *zio) vd = zio->io_vd; - if (zio->io_type == ZIO_TYPE_IOCTL) { + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: /* XXPOLICY */ if (!vdev_readable(vd)) { - zio->io_error = ENXIO; - return (ZIO_PIPELINE_CONTINUE); + zio->io_error = SET_ERROR(ENXIO); + } else { + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + if (zfs_nocacheflush || vdev_geom_bio_flush_disable) + break; + if (vd->vdev_nowritecache) { + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + goto sendreq; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } } - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - if (zfs_nocacheflush || vdev_geom_bio_flush_disable) - break; - if (vd->vdev_nowritecache) { - zio->io_error = ENOTSUP; - break; - } + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); + case ZIO_TYPE_FREE: + if (vd->vdev_notrim) { + zio->io_error = SET_ERROR(ENOTSUP); + } else if (!vdev_geom_bio_delete_disable) { goto sendreq; - case DKIOCTRIM: - if (vdev_geom_bio_delete_disable) - break; - if (vd->vdev_notrim) { - zio->io_error = ENOTSUP; - break; - } - goto sendreq; - default: - zio->io_error = ENOTSUP; } - - return (ZIO_PIPELINE_CONTINUE); + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); } sendreq: cp = vd->vdev_tsd; if (cp == NULL) { - zio->io_error = ENXIO; - return (ZIO_PIPELINE_CONTINUE); + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); } bp = g_alloc_bio(); bp->bio_caller1 = zio; @@ -811,22 +813,18 @@ sendreq: bp->bio_offset = zio->io_offset; bp->bio_length = zio->io_size; break; + case ZIO_TYPE_FREE: + bp->bio_cmd = BIO_DELETE; + bp->bio_data = NULL; + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + break; case ZIO_TYPE_IOCTL: - switch (zio->io_cmd) { - case DKIOCFLUSHWRITECACHE: - bp->bio_cmd = BIO_FLUSH; - bp->bio_flags |= BIO_ORDERED; - bp->bio_data = NULL; - bp->bio_offset = cp->provider->mediasize; - bp->bio_length = 0; - break; - case DKIOCTRIM: - bp->bio_cmd = BIO_DELETE; - bp->bio_data = NULL; - bp->bio_offset = zio->io_offset; - bp->bio_length = zio->io_size; - break; - } + bp->bio_cmd = BIO_FLUSH; + bp->bio_flags |= BIO_ORDERED; + bp->bio_data = NULL; + bp->bio_offset = cp->provider->mediasize; + bp->bio_length = 0; break; } bp->bio_done = vdev_geom_io_intr; Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c Thu Aug 21 22:42:02 2014 (r270311) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c Thu Aug 21 22:44:08 2014 (r270312) @@ -287,7 +287,8 @@ vdev_mirror_io_start(zio_t *zio) zio->io_type, zio->io_priority, 0, vdev_mirror_scrub_done, mc)); } - return (ZIO_PIPELINE_CONTINUE); + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); } /* * For normal reads just pick one child. @@ -314,7 +315,8 @@ vdev_mirror_io_start(zio_t *zio) c++; } - return (ZIO_PIPELINE_CONTINUE); + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); } static int Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c Thu Aug 21 22:42:02 2014 (r270311) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c Thu Aug 21 22:44:08 2014 (r270312) @@ -71,7 +71,8 @@ static int vdev_missing_io_start(zio_t *zio) { zio->io_error = SET_ERROR(ENOTSUP); - return (ZIO_PIPELINE_CONTINUE); + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); } /* ARGSUSED */ Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c Thu Aug 21 22:42:02 2014 (r270311) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c Thu Aug 21 22:44:08 2014 (r270312) @@ -40,9 +40,9 @@ * * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The * I/O scheduler determines when and in what order those operations are - * issued. The I/O scheduler divides operations into five I/O classes + * issued. The I/O scheduler divides operations into six I/O classes * prioritized in the following order: sync read, sync write, async read, - * async write, and scrub/resilver. Each queue defines the minimum and + * async write, scrub/resilver and trim. Each queue defines the minimum and * maximum number of concurrent operations that may be issued to the device. * In addition, the device has an aggregate maximum. Note that the sum of the * per-queue minimums must not exceed the aggregate maximum, and if the @@ -61,7 +61,7 @@ * done in the order specified above. No further operations are issued if the * aggregate maximum number of concurrent operations has been hit or if there * are no operations queued for an I/O class that has not hit its maximum. - * Every time an i/o is queued or an operation completes, the I/O scheduler + * Every time an I/O is queued or an operation completes, the I/O scheduler * looks for new operations to issue. * * All I/O classes have a fixed maximum number of outstanding operations @@ -70,7 +70,7 @@ * transaction groups (see txg.c). Transaction groups enter the syncing state * periodically so the number of queued async writes will quickly burst up and * then bleed down to zero. Rather than servicing them as quickly as possible, - * the I/O scheduler changes the maximum number of active async write i/os + * the I/O scheduler changes the maximum number of active async write I/Os * according to the amount of dirty data in the pool (see dsl_pool.c). Since * both throughput and latency typically increase with the number of * concurrent operations issued to physical devices, reducing the burstiness @@ -113,14 +113,14 @@ */ /* - * The maximum number of i/os active to each device. Ideally, this will be >= + * The maximum number of I/Os active to each device. Ideally, this will be >= * the sum of each queue's max_active. It must be at least the sum of each * queue's min_active. */ uint32_t zfs_vdev_max_active = 1000; /* - * Per-queue limits on the number of i/os active to each device. If the + * Per-queue limits on the number of I/Os active to each device. If the * sum of the queue's max_active is < zfs_vdev_max_active, then the * min_active comes into play. We will send min_active from each queue, * and then select from queues in the order defined by zio_priority_t. @@ -145,6 +145,14 @@ uint32_t zfs_vdev_async_write_min_active uint32_t zfs_vdev_async_write_max_active = 10; uint32_t zfs_vdev_scrub_min_active = 1; uint32_t zfs_vdev_scrub_max_active = 2; +uint32_t zfs_vdev_trim_min_active = 1; +/* + * TRIM max active is large in comparison to the other values due to the fact + * that TRIM IOs are coalesced at the device layer. This value is set such + * that a typical SSD can process the queued IOs in a single request. + */ +uint32_t zfs_vdev_trim_max_active = 64; + /* * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent @@ -171,7 +179,7 @@ SYSCTL_DECL(_vfs_zfs_vdev); TUNABLE_INT("vfs.zfs.vdev.max_active", &zfs_vdev_max_active); SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RW, &zfs_vdev_max_active, 0, - "The maximum number of i/os of all types active for each device."); + "The maximum number of I/Os of all types active for each device."); #define ZFS_VDEV_QUEUE_KNOB_MIN(name) \ TUNABLE_INT("vfs.zfs.vdev." #name "_min_active", \ @@ -199,6 +207,8 @@ ZFS_VDEV_QUEUE_KNOB_MIN(async_write); ZFS_VDEV_QUEUE_KNOB_MAX(async_write); ZFS_VDEV_QUEUE_KNOB_MIN(scrub); ZFS_VDEV_QUEUE_KNOB_MAX(scrub); +ZFS_VDEV_QUEUE_KNOB_MIN(trim); +ZFS_VDEV_QUEUE_KNOB_MAX(trim); #undef ZFS_VDEV_QUEUE_KNOB @@ -297,6 +307,7 @@ static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; + ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio); @@ -313,6 +324,7 @@ static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; + ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio); @@ -401,6 +413,8 @@ vdev_queue_class_min_active(zio_priority return (zfs_vdev_async_write_min_active); case ZIO_PRIORITY_SCRUB: return (zfs_vdev_scrub_min_active); + case ZIO_PRIORITY_TRIM: + return (zfs_vdev_trim_min_active); default: panic("invalid priority %u", p); return (0); @@ -460,6 +474,8 @@ vdev_queue_class_max_active(spa_t *spa, return (vdev_queue_max_async_writes(spa)); case ZIO_PRIORITY_SCRUB: return (zfs_vdev_scrub_max_active); + case ZIO_PRIORITY_TRIM: + return (zfs_vdev_trim_max_active); default: panic("invalid priority %u", p); return (0); @@ -476,6 +492,8 @@ vdev_queue_class_to_issue(vdev_queue_t * spa_t *spa = vq->vq_vdev->vdev_spa; zio_priority_t p; + ASSERT(MUTEX_HELD(&vq->vq_lock)); + if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); @@ -517,10 +535,11 @@ vdev_queue_aggregate(vdev_queue_t *vq, z zio_t *first, *last, *aio, *dio, *mandatory, *nio; uint64_t maxgap = 0; uint64_t size; - boolean_t stretch = B_FALSE; - vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority]; - avl_tree_t *t = &vqc->vqc_queued_tree; - enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; + boolean_t stretch; + avl_tree_t *t; + enum zio_flag flags; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) return (NULL); @@ -558,6 +577,8 @@ vdev_queue_aggregate(vdev_queue_t *vq, z * Walk backwards through sufficiently contiguous I/Os * recording the last non-option I/O. */ + flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; + t = &vq->vq_class[zio->io_priority].vqc_queued_tree; while ((dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit && @@ -597,6 +618,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, z * non-optional I/O is close enough to make aggregation * worthwhile. */ + stretch = B_FALSE; if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { zio_t *nio = last; while ((dio = AVL_NEXT(t, nio)) != NULL && @@ -737,11 +759,13 @@ vdev_queue_io(zio_t *zio) zio->io_priority != ZIO_PRIORITY_ASYNC_READ && zio->io_priority != ZIO_PRIORITY_SCRUB) zio->io_priority = ZIO_PRIORITY_ASYNC_READ; - } else { - ASSERT(zio->io_type == ZIO_TYPE_WRITE); + } else if (zio->io_type == ZIO_TYPE_WRITE) { if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE) zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; + } else { + ASSERT(zio->io_type == ZIO_TYPE_FREE); + zio->io_priority = ZIO_PRIORITY_TRIM; } zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c Thu Aug 21 22:42:02 2014 (r270311) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c Thu Aug 21 22:44:08 2014 (r270312) @@ -1755,7 +1755,9 @@ vdev_raidz_io_start(zio_t *zio) zio->io_type, zio->io_priority, 0, vdev_raidz_child_done, rc)); } - return (ZIO_PIPELINE_CONTINUE); + + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); } if (zio->io_type == ZIO_TYPE_WRITE) { @@ -1787,7 +1789,8 @@ vdev_raidz_io_start(zio_t *zio) ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); } - return (ZIO_PIPELINE_CONTINUE); + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); } ASSERT(zio->io_type == ZIO_TYPE_READ); @@ -1827,7 +1830,8 @@ vdev_raidz_io_start(zio_t *zio) } } - return (ZIO_PIPELINE_CONTINUE); + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); } Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Thu Aug 21 22:42:02 2014 (r270311) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Thu Aug 21 22:44:08 2014 (r270312) @@ -807,6 +807,8 @@ zio_free_sync(zio_t *pio, spa_t *spa, ui else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) stage |= ZIO_STAGE_ISSUE_ASYNC; + flags |= ZIO_FLAG_DONT_QUEUE; + zio = zio_create(pio, spa, txg, bp, NULL, size, NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage); @@ -851,14 +853,14 @@ zio_claim(zio_t *pio, spa_t *spa, uint64 zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset, uint64_t size, zio_done_func_t *done, void *private, - enum zio_flag flags) + zio_priority_t priority, enum zio_flag flags) { zio_t *zio; int c; if (vd->vdev_children == 0) { zio = zio_create(pio, spa, 0, NULL, NULL, size, done, private, - ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, offset, NULL, + ZIO_TYPE_IOCTL, priority, flags, vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); zio->io_cmd = cmd; @@ -867,7 +869,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, - offset, size, done, private, flags)); + offset, size, done, private, priority, flags)); } return (zio); @@ -952,6 +954,10 @@ zio_vdev_child_io(zio_t *pio, blkptr_t * pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } + /* Not all IO types require vdev io done stage e.g. free */ + if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE)) + pipeline &= ~ZIO_STAGE_VDEV_IO_DONE; + if (vd->vdev_children == 0) offset += VDEV_LABEL_START_SIZE; @@ -997,7 +1003,7 @@ void zio_flush(zio_t *zio, vdev_t *vd) { zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, 0, 0, - NULL, NULL, + NULL, NULL, ZIO_PRIORITY_NOW, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } @@ -1007,9 +1013,10 @@ zio_trim(zio_t *zio, spa_t *spa, vdev_t ASSERT(vd->vdev_ops->vdev_op_leaf); - return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size, - NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY); + return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL, + ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE | + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, + vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE)); } void @@ -1036,9 +1043,8 @@ zio_shrink(zio_t *zio, uint64_t size) */ static int -zio_read_bp_init(zio_t **ziop) +zio_read_bp_init(zio_t *zio) { - zio_t *zio = *ziop; blkptr_t *bp = zio->io_bp; if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && @@ -1071,9 +1077,8 @@ zio_read_bp_init(zio_t **ziop) } static int -zio_write_bp_init(zio_t **ziop) +zio_write_bp_init(zio_t *zio) { - zio_t *zio = *ziop; spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; enum zio_compress compress = zp->zp_compress; @@ -1253,9 +1258,8 @@ zio_write_bp_init(zio_t **ziop) } static int -zio_free_bp_init(zio_t **ziop) +zio_free_bp_init(zio_t *zio) { - zio_t *zio = *ziop; blkptr_t *bp = zio->io_bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) { @@ -1338,10 +1342,8 @@ zio_taskq_member(zio_t *zio, zio_taskq_t } static int -zio_issue_async(zio_t **ziop) +zio_issue_async(zio_t *zio) { - zio_t *zio = *ziop; - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); @@ -1409,7 +1411,7 @@ zio_execute(zio_t *zio) } zio->io_stage = stage; - rv = zio_pipeline[highbit64(stage) - 1](&zio); + rv = zio_pipeline[highbit64(stage) - 1](zio); if (rv == ZIO_PIPELINE_STOP) return; @@ -1843,9 +1845,8 @@ zio_gang_tree_issue(zio_t *pio, zio_gang } static int -zio_gang_assemble(zio_t **ziop) +zio_gang_assemble(zio_t *zio) { - zio_t *zio = *ziop; blkptr_t *bp = zio->io_bp; ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); @@ -1859,9 +1860,8 @@ zio_gang_assemble(zio_t **ziop) } static int -zio_gang_issue(zio_t **ziop) +zio_gang_issue(zio_t *zio) { - zio_t *zio = *ziop; blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) @@ -1995,9 +1995,8 @@ zio_write_gang_block(zio_t *pio) * writes) and as a result is mutually exclusive with dedup. */ static int -zio_nop_write(zio_t **ziop) +zio_nop_write(zio_t *zio) { - zio_t *zio = *ziop; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; zio_prop_t *zp = &zio->io_prop; @@ -2068,9 +2067,8 @@ zio_ddt_child_read_done(zio_t *zio) } static int -zio_ddt_read_start(zio_t **ziop) +zio_ddt_read_start(zio_t *zio) { - zio_t *zio = *ziop; blkptr_t *bp = zio->io_bp; ASSERT(BP_GET_DEDUP(bp)); @@ -2112,9 +2110,8 @@ zio_ddt_read_start(zio_t **ziop) } static int -zio_ddt_read_done(zio_t **ziop) +zio_ddt_read_done(zio_t *zio) { - zio_t *zio = *ziop; blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) @@ -2282,9 +2279,8 @@ zio_ddt_ditto_write_done(zio_t *zio) } static int -zio_ddt_write(zio_t **ziop) +zio_ddt_write(zio_t *zio) { - zio_t *zio = *ziop; spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; uint64_t txg = zio->io_txg; @@ -2395,9 +2391,8 @@ zio_ddt_write(zio_t **ziop) ddt_entry_t *freedde; /* for debugging */ static int -zio_ddt_free(zio_t **ziop) +zio_ddt_free(zio_t *zio) { - zio_t *zio = *ziop; spa_t *spa = zio->io_spa; blkptr_t *bp = zio->io_bp; ddt_t *ddt = ddt_select(spa, bp); @@ -2422,9 +2417,8 @@ zio_ddt_free(zio_t **ziop) * ========================================================================== */ static int -zio_dva_allocate(zio_t **ziop) +zio_dva_allocate(zio_t *zio) { - zio_t *zio = *ziop; spa_t *spa = zio->io_spa; metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = zio->io_bp; @@ -2466,19 +2460,16 @@ zio_dva_allocate(zio_t **ziop) } static int -zio_dva_free(zio_t **ziop) +zio_dva_free(zio_t *zio) { - zio_t *zio = *ziop; - metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); return (ZIO_PIPELINE_CONTINUE); } static int -zio_dva_claim(zio_t **ziop) +zio_dva_claim(zio_t *zio) { - zio_t *zio = *ziop; int error; error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); @@ -2572,12 +2563,12 @@ zio_free_zil(spa_t *spa, uint64_t txg, b * ========================================================================== */ static int -zio_vdev_io_start(zio_t **ziop) +zio_vdev_io_start(zio_t *zio) { - zio_t *zio = *ziop; vdev_t *vd = zio->io_vd; uint64_t align; spa_t *spa = zio->io_spa; + int ret; ASSERT(zio->io_error == 0); ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); @@ -2592,7 +2583,8 @@ zio_vdev_io_start(zio_t **ziop) return (vdev_mirror_ops.vdev_op_io_start(zio)); } - if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) { + if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE && + zio->io_priority == ZIO_PRIORITY_NOW) { trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg); return (ZIO_PIPELINE_CONTINUE); } @@ -2677,41 +2669,44 @@ zio_vdev_io_start(zio_t **ziop) return (ZIO_PIPELINE_CONTINUE); } - if (vd->vdev_ops->vdev_op_leaf && - (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { - - if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) - return (ZIO_PIPELINE_CONTINUE); + if (vd->vdev_ops->vdev_op_leaf) { + switch (zio->io_type) { + case ZIO_TYPE_READ: + if (vdev_cache_read(zio)) + return (ZIO_PIPELINE_CONTINUE); + /* FALLTHROUGH */ + case ZIO_TYPE_WRITE: + case ZIO_TYPE_FREE: + if ((zio = vdev_queue_io(zio)) == NULL) + return (ZIO_PIPELINE_STOP); - if ((zio = vdev_queue_io(zio)) == NULL) - return (ZIO_PIPELINE_STOP); - *ziop = zio; - - if (!vdev_accessible(vd, zio)) { - zio->io_error = SET_ERROR(ENXIO); - zio_interrupt(zio); - return (ZIO_PIPELINE_STOP); + if (!vdev_accessible(vd, zio)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return (ZIO_PIPELINE_STOP); + } + break; } - } - - /* - * Note that we ignore repair writes for TRIM because they can conflict - * with normal writes. This isn't an issue because, by definition, we - * only repair blocks that aren't freed. - */ - if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE && - !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { - if (!trim_map_write_start(zio)) + /* + * Note that we ignore repair writes for TRIM because they can + * conflict with normal writes. This isn't an issue because, by + * definition, we only repair blocks that aren't freed. + */ + if (zio->io_type == ZIO_TYPE_WRITE && + !(zio->io_flags & ZIO_FLAG_IO_REPAIR) && + !trim_map_write_start(zio)) return (ZIO_PIPELINE_STOP); } - return (vd->vdev_ops->vdev_op_io_start(zio)); + ret = vd->vdev_ops->vdev_op_io_start(zio); + ASSERT(ret == ZIO_PIPELINE_STOP); + + return (ret); } static int -zio_vdev_io_done(zio_t **ziop) +zio_vdev_io_done(zio_t *zio) { - zio_t *zio = *ziop; vdev_t *vd = zio->io_vd; vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; boolean_t unexpected_error = B_FALSE; @@ -2723,7 +2718,8 @@ zio_vdev_io_done(zio_t **ziop) zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE); if (vd != NULL && vd->vdev_ops->vdev_op_leaf && - (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { + (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_FREE)) { if (zio->io_type == ZIO_TYPE_WRITE && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) @@ -2785,9 +2781,8 @@ zio_vsd_default_cksum_report(zio_t *zio, } static int -zio_vdev_io_assess(zio_t **ziop) +zio_vdev_io_assess(zio_t *zio) { - zio_t *zio = *ziop; vdev_t *vd = zio->io_vd; if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE)) @@ -2804,7 +2799,8 @@ zio_vdev_io_assess(zio_t **ziop) if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_fault_injection(zio, EIO); - if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM) + if (zio->io_type == ZIO_TYPE_FREE && + zio->io_priority != ZIO_PRIORITY_NOW) { switch (zio->io_error) { case 0: ZIO_TRIM_STAT_INCR(bytes, zio->io_size); @@ -2817,6 +2813,7 @@ zio_vdev_io_assess(zio_t **ziop) ZIO_TRIM_STAT_BUMP(failed); break; } + } /* * If the I/O failed, determine whether we should attempt to retry it. @@ -2900,9 +2897,8 @@ zio_vdev_io_bypass(zio_t *zio) * ========================================================================== */ static int -zio_checksum_generate(zio_t **ziop) +zio_checksum_generate(zio_t *zio) { - zio_t *zio = *ziop; blkptr_t *bp = zio->io_bp; enum zio_checksum checksum; @@ -2932,9 +2928,8 @@ zio_checksum_generate(zio_t **ziop) } static int -zio_checksum_verify(zio_t **ziop) +zio_checksum_verify(zio_t *zio) { - zio_t *zio = *ziop; zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; @@ -3005,9 +3000,8 @@ zio_worst_error(int e1, int e2) * ========================================================================== */ static int -zio_ready(zio_t **ziop) +zio_ready(zio_t *zio) { - zio_t *zio = *ziop; blkptr_t *bp = zio->io_bp; zio_t *pio, *pio_next; @@ -3064,9 +3058,8 @@ zio_ready(zio_t **ziop) } static int -zio_done(zio_t **ziop) +zio_done(zio_t *zio) { - zio_t *zio = *ziop; spa_t *spa = zio->io_spa; zio_t *lio = zio->io_logical; blkptr_t *bp = zio->io_bp;