From owner-svn-src-stable-10@FreeBSD.ORG Wed Jan 28 02:55:21 2015 Return-Path: Delivered-To: svn-src-stable-10@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [8.8.178.115]) (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTPS id 5B02B50F; Wed, 28 Jan 2015 02:55:21 +0000 (UTC) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id 456196B1; Wed, 28 Jan 2015 02:55:21 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.9/8.14.9) with ESMTP id t0S2tLGt032029; Wed, 28 Jan 2015 02:55:21 GMT (envelope-from mav@FreeBSD.org) Received: (from mav@localhost) by svn.freebsd.org (8.14.9/8.14.9/Submit) id t0S2tLRx032028; Wed, 28 Jan 2015 02:55:21 GMT (envelope-from mav@FreeBSD.org) Message-Id: <201501280255.t0S2tLRx032028@svn.freebsd.org> X-Authentication-Warning: svn.freebsd.org: mav set sender to mav@FreeBSD.org using -f From: Alexander Motin Date: Wed, 28 Jan 2015 02:55:21 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org Subject: svn commit: r277818 - stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs X-SVN-Group: stable-10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-stable-10@freebsd.org X-Mailman-Version: 2.1.18-1 Precedence: list List-Id: SVN commit messages for only the 10-stable src tree List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 28 Jan 2015 02:55:21 -0000 Author: mav Date: Wed Jan 28 02:55:20 2015 New Revision: 277818 URL: https://svnweb.freebsd.org/changeset/base/277818 Log: MFC r277169: Reimplement TRIM throttling added in r248577. 
The previous throttling implementation approached the problem from the wrong side. It significantly limited the useful delaying of TRIM requests and their aggregation potential, while doing little to control TRIM burstiness under heavy load. With this change, random 4K write benchmarks (probably the worst case for TRIM) show an IOPS increase of 20%, an average latency reduction of 30%, a threefold reduction in peak TRIM bursts, and the same peak TRIM map size (memory usage). Also, the new logic does not force the map size down so heavily, actually allowing deleted data to be kept for 32 TXGs or 30 seconds under moderate load. That was practically impossible with the old throttling logic, which pushed the map down to only 64 segments. Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c Directory Properties: stable/10/ (props changed) Modified: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c ============================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c Wed Jan 28 02:33:06 2015 (r277817) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c Wed Jan 28 02:55:20 2015 (r277818) @@ -40,17 +40,20 @@ #define TRIM_ZIO_END(vd, offset, size) (offset + \ P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift)) -#define TRIM_MAP_SINC(tm, size) \ - atomic_add_64(&(tm)->tm_bytes, (size)) +/* Maximal segment size for ATA TRIM. 
*/ +#define TRIM_MAP_SIZE_FACTOR (512 << 16) -#define TRIM_MAP_SDEC(tm, size) \ - atomic_add_64(&(tm)->tm_bytes, -(size)) +#define TRIM_MAP_SEGS(size) (1 + (size) / TRIM_MAP_SIZE_FACTOR) -#define TRIM_MAP_QINC(tm) \ - atomic_inc_64(&(tm)->tm_pending); \ - -#define TRIM_MAP_QDEC(tm) \ - atomic_dec_64(&(tm)->tm_pending); +#define TRIM_MAP_ADD(tm, ts) do { \ + list_insert_tail(&(tm)->tm_head, (ts)); \ + (tm)->tm_pending += TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \ +} while (0) + +#define TRIM_MAP_REM(tm, ts) do { \ + list_remove(&(tm)->tm_head, (ts)); \ + (tm)->tm_pending -= TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \ +} while (0) typedef struct trim_map { list_t tm_head; /* List of segments sorted by txg. */ @@ -60,7 +63,6 @@ typedef struct trim_map { list_t tm_pending_writes; /* Writes blocked on in-flight frees. */ kmutex_t tm_lock; uint64_t tm_pending; /* Count of pending TRIMs. */ - uint64_t tm_bytes; /* Total size in bytes of queued TRIMs. */ } trim_map_t; typedef struct trim_seg { @@ -74,13 +76,10 @@ typedef struct trim_seg { extern boolean_t zfs_trim_enabled; -static u_int trim_txg_delay = 32; -static u_int trim_timeout = 30; -static u_int trim_max_interval = 1; -/* Limit outstanding TRIMs to 2G (max size for a single TRIM request) */ -static uint64_t trim_vdev_max_bytes = 2147483648; -/* Limit outstanding TRIMs to 64 (max ranges for a single TRIM request) */ -static u_int trim_vdev_max_pending = 64; +static u_int trim_txg_delay = 32; /* Keep deleted data up to 32 TXG */ +static u_int trim_timeout = 30; /* Keep deleted data up to 30s */ +static u_int trim_max_interval = 1; /* 1s delays between TRIMs */ +static u_int trim_vdev_max_pending = 10000; /* Keep up to 10K segments */ SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM"); @@ -99,11 +98,6 @@ SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max "Maximum interval between TRIM queue processing (seconds)"); SYSCTL_DECL(_vfs_zfs_vdev); 
-TUNABLE_QUAD("vfs.zfs.vdev.trim_max_bytes", &trim_vdev_max_bytes); -SYSCTL_QUAD(_vfs_zfs_vdev, OID_AUTO, trim_max_bytes, CTLFLAG_RWTUN, - &trim_vdev_max_bytes, 0, - "Maximum pending TRIM bytes for a vdev"); - TUNABLE_INT("vfs.zfs.vdev.trim_max_pending", &trim_vdev_max_pending); SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN, &trim_vdev_max_pending, 0, @@ -198,10 +192,8 @@ trim_map_destroy(vdev_t *vd) mutex_enter(&tm->tm_lock); while ((ts = list_head(&tm->tm_head)) != NULL) { avl_remove(&tm->tm_queued_frees, ts); - list_remove(&tm->tm_head, ts); + TRIM_MAP_REM(tm, ts); kmem_free(ts, sizeof (*ts)); - TRIM_MAP_SDEC(tm, ts->ts_end - ts->ts_start); - TRIM_MAP_QDEC(tm); } mutex_exit(&tm->tm_lock); @@ -246,40 +238,34 @@ trim_map_segment_add(trim_map_t *tm, uin merge_after = (ts_after != NULL && ts_after->ts_start == end); if (merge_before && merge_after) { - TRIM_MAP_SINC(tm, ts_after->ts_start - ts_before->ts_end); - TRIM_MAP_QDEC(tm); avl_remove(&tm->tm_queued_frees, ts_before); - list_remove(&tm->tm_head, ts_before); + TRIM_MAP_REM(tm, ts_before); + TRIM_MAP_REM(tm, ts_after); ts_after->ts_start = ts_before->ts_start; ts_after->ts_txg = txg; ts_after->ts_time = time; - list_remove(&tm->tm_head, ts_after); - list_insert_tail(&tm->tm_head, ts_after); + TRIM_MAP_ADD(tm, ts_after); kmem_free(ts_before, sizeof (*ts_before)); } else if (merge_before) { - TRIM_MAP_SINC(tm, end - ts_before->ts_end); + TRIM_MAP_REM(tm, ts_before); ts_before->ts_end = end; ts_before->ts_txg = txg; ts_before->ts_time = time; - list_remove(&tm->tm_head, ts_before); - list_insert_tail(&tm->tm_head, ts_before); + TRIM_MAP_ADD(tm, ts_before); } else if (merge_after) { - TRIM_MAP_SINC(tm, ts_after->ts_start - start); + TRIM_MAP_REM(tm, ts_after); ts_after->ts_start = start; ts_after->ts_txg = txg; ts_after->ts_time = time; - list_remove(&tm->tm_head, ts_after); - list_insert_tail(&tm->tm_head, ts_after); + TRIM_MAP_ADD(tm, ts_after); } else { - TRIM_MAP_SINC(tm, end - start); - 
TRIM_MAP_QINC(tm); ts = kmem_alloc(sizeof (*ts), KM_SLEEP); ts->ts_start = start; ts->ts_end = end; ts->ts_txg = txg; ts->ts_time = time; avl_insert(&tm->tm_queued_frees, ts, where); - list_insert_tail(&tm->tm_head, ts); + TRIM_MAP_ADD(tm, ts); } } @@ -295,7 +281,7 @@ trim_map_segment_remove(trim_map_t *tm, left_over = (ts->ts_start < start); right_over = (ts->ts_end > end); - TRIM_MAP_SDEC(tm, end - start); + TRIM_MAP_REM(tm, ts); if (left_over && right_over) { nts = kmem_alloc(sizeof (*nts), KM_SLEEP); nts->ts_start = end; @@ -304,16 +290,16 @@ trim_map_segment_remove(trim_map_t *tm, nts->ts_time = ts->ts_time; ts->ts_end = start; avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER); - list_insert_after(&tm->tm_head, ts, nts); - TRIM_MAP_QINC(tm); + TRIM_MAP_ADD(tm, ts); + TRIM_MAP_ADD(tm, nts); } else if (left_over) { ts->ts_end = start; + TRIM_MAP_ADD(tm, ts); } else if (right_over) { ts->ts_start = end; + TRIM_MAP_ADD(tm, ts); } else { avl_remove(&tm->tm_queued_frees, ts); - list_remove(&tm->tm_head, ts); - TRIM_MAP_QDEC(tm); kmem_free(ts, sizeof (*ts)); } } @@ -432,7 +418,8 @@ trim_map_write_done(zio_t *zio) * the first element's time is not greater than time argument */ static trim_seg_t * -trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time) +trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time, + boolean_t force) { trim_seg_t *ts; @@ -441,9 +428,7 @@ trim_map_first(trim_map_t *tm, uint64_t ts = list_head(&tm->tm_head); if (ts != NULL && ts->ts_txg <= txgsafe && - (ts->ts_txg <= txg || ts->ts_time <= time || - tm->tm_bytes > trim_vdev_max_bytes || - tm->tm_pending > trim_vdev_max_pending)) + (ts->ts_txg <= txg || ts->ts_time <= time || force)) return (ts); return (NULL); } @@ -454,6 +439,7 @@ trim_map_vdev_commit(spa_t *spa, zio_t * trim_map_t *tm = vd->vdev_trimmap; trim_seg_t *ts; uint64_t size, offset, txgtarget, txgsafe; + int64_t hard, soft; hrtime_t timelimit; ASSERT(vd->vdev_ops->vdev_op_leaf); @@ 
-474,16 +460,19 @@ trim_map_vdev_commit(spa_t *spa, zio_t * } mutex_enter(&tm->tm_lock); + hard = 0; + if (tm->tm_pending > trim_vdev_max_pending) + hard = (tm->tm_pending - trim_vdev_max_pending) / 4; + soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64); /* Loop until we have sent all outstanding free's */ - while ((ts = trim_map_first(tm, txgtarget, txgsafe, timelimit)) + while (soft > 0 && + (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0)) != NULL) { - list_remove(&tm->tm_head, ts); + TRIM_MAP_REM(tm, ts); avl_remove(&tm->tm_queued_frees, ts); avl_add(&tm->tm_inflight_frees, ts); size = ts->ts_end - ts->ts_start; offset = ts->ts_start; - TRIM_MAP_SDEC(tm, size); - TRIM_MAP_QDEC(tm); /* * We drop the lock while we call zio_nowait as the IO * scheduler can result in a different IO being run e.g. @@ -493,6 +482,8 @@ trim_map_vdev_commit(spa_t *spa, zio_t * zio_nowait(zio_trim(zio, spa, vd, offset, size)); + soft -= TRIM_MAP_SEGS(size); + hard -= TRIM_MAP_SEGS(size); mutex_enter(&tm->tm_lock); } mutex_exit(&tm->tm_lock);