From owner-svn-src-vendor@FreeBSD.ORG Mon Jul 21 22:08:51 2014 Return-Path: Delivered-To: svn-src-vendor@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) (using TLSv1 with cipher ADH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTPS id 354E151C; Mon, 21 Jul 2014 22:08:51 +0000 (UTC) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id 227FB29E3; Mon, 21 Jul 2014 22:08:51 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.8/8.14.8) with ESMTP id s6LM8pNl072374; Mon, 21 Jul 2014 22:08:51 GMT (envelope-from peter@svn.freebsd.org) Received: (from peter@localhost) by svn.freebsd.org (8.14.8/8.14.8/Submit) id s6LM8oIp072366; Mon, 21 Jul 2014 22:08:50 GMT (envelope-from peter@svn.freebsd.org) Message-Id: <201407212208.s6LM8oIp072366@svn.freebsd.org> From: Peter Wemm Date: Mon, 21 Jul 2014 22:08:50 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org Subject: svn commit: r268958 - in vendor/serf/dist: . auth X-SVN-Group: vendor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-vendor@freebsd.org X-Mailman-Version: 2.1.18 Precedence: list List-Id: SVN commit messages for the vendor work area tree List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 21 Jul 2014 22:08:51 -0000 Author: peter Date: Mon Jul 21 22:08:49 2014 New Revision: 268958 URL: http://svnweb.freebsd.org/changeset/base/268958 Log: Vendor import serf-1.3.6 + Revert r2319 from serf 1.3.5: this change was making serf call handle_response + multiple times in case of an error response, leading to unexpected behavior. Modified: vendor/serf/dist/CHANGES vendor/serf/dist/auth/auth.c vendor/serf/dist/outgoing.c vendor/serf/dist/serf.h Modified: vendor/serf/dist/CHANGES ============================================================================== --- vendor/serf/dist/CHANGES Mon Jul 21 21:26:10 2014 (r268957) +++ vendor/serf/dist/CHANGES Mon Jul 21 22:08:49 2014 (r268958) @@ -1,3 +1,7 @@ +Serf 1.3.6 [2014-06-09, from /tags/1.3.6, rxxxx] + Revert r2319 from serf 1.3.5: this change was making serf call handle_response + multiple times in case of an error response, leading to unexpected behavior. + Serf 1.3.5 [2014-04-27, from /tags/1.3.5, rxxxx] Fix issue #125: no reverse lookup during Negotiate authentication for proxies. Fix a crash caused by incorrect reuse of the ssltunnel CONNECT request (r2316) Modified: vendor/serf/dist/auth/auth.c ============================================================================== --- vendor/serf/dist/auth/auth.c Mon Jul 21 21:26:10 2014 (r268957) +++ vendor/serf/dist/auth/auth.c Mon Jul 21 22:08:49 2014 (r268958) @@ -408,7 +408,6 @@ apr_status_t serf__handle_auth_response( consider the reponse body as invalid and discard it. */ status = discard_body(response); *consumed_response = 1; - if (!APR_STATUS_IS_EOF(status)) { return status; } Modified: vendor/serf/dist/outgoing.c ============================================================================== --- vendor/serf/dist/outgoing.c Mon Jul 21 21:26:10 2014 (r268957) +++ vendor/serf/dist/outgoing.c Mon Jul 21 22:08:49 2014 (r268958) @@ -916,22 +916,21 @@ static apr_status_t handle_response(serf * themselves by not registering credential callbacks. */ if (request->conn->ctx->cred_cb) { - status = serf__handle_auth_response(&consumed_response, - request, - request->resp_bkt, - request->handler_baton, - pool); - - if (SERF_BUCKET_READ_ERROR(status)) { - /* Report the request as 'died'/'cancelled' to the application */ - (void)(*request->handler)(request, - NULL, - request->handler_baton, - pool); - } - - if (status) - return status; + status = serf__handle_auth_response(&consumed_response, + request, + request->resp_bkt, + request->handler_baton, + pool); + + /* If there was an error reading the response (maybe there wasn't + enough data available), don't bother passing the response to the + application. + + If the authentication was tried, but failed, pass the response + to the application, maybe it can do better. */ + if (status) { + return status; + } } if (!consumed_response) { Modified: vendor/serf/dist/serf.h ============================================================================== --- vendor/serf/dist/serf.h Mon Jul 21 21:26:10 2014 (r268957) +++ vendor/serf/dist/serf.h Mon Jul 21 22:08:49 2014 (r268958) @@ -1062,7 +1062,7 @@ void serf_debug__bucket_alloc_check( /* Version info */ #define SERF_MAJOR_VERSION 1 #define SERF_MINOR_VERSION 3 -#define SERF_PATCH_VERSION 5 +#define SERF_PATCH_VERSION 6 /* Version number string */ #define SERF_VERSION_STRING APR_STRINGIFY(SERF_MAJOR_VERSION) "." \ From owner-svn-src-vendor@FreeBSD.ORG Mon Jul 21 22:09:17 2014 Return-Path: Delivered-To: svn-src-vendor@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) (using TLSv1 with cipher ADH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTPS id 2287F64B; Mon, 21 Jul 2014 22:09:17 +0000 (UTC) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id EAB3E29EB; Mon, 21 Jul 2014 22:09:16 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.8/8.14.8) with ESMTP id s6LM9Gam072510; Mon, 21 Jul 2014 22:09:16 GMT (envelope-from peter@svn.freebsd.org) Received: (from peter@localhost) by svn.freebsd.org (8.14.8/8.14.8/Submit) id s6LM9GDq072509; Mon, 21 Jul 2014 22:09:16 GMT (envelope-from peter@svn.freebsd.org) Message-Id: <201407212209.s6LM9GDq072509@svn.freebsd.org> From: Peter Wemm Date: Mon, 21 Jul 2014 22:09:16 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org Subject: svn commit: r268959 - vendor/serf/serf-1.3.6 X-SVN-Group: vendor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-vendor@freebsd.org X-Mailman-Version: 2.1.18 Precedence: list List-Id: SVN commit messages for the vendor work area tree List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 21 Jul 2014 22:09:17 -0000 Author: peter Date: Mon Jul 21 22:09:16 2014 New Revision: 268959 URL: http://svnweb.freebsd.org/changeset/base/268959 Log: Tag serf-1.3.6 Added: vendor/serf/serf-1.3.6/ - copied from r268958, vendor/serf/dist/ From owner-svn-src-vendor@FreeBSD.ORG Wed Jul 23 08:00:36 2014 Return-Path: Delivered-To: svn-src-vendor@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [8.8.178.115]) (using TLSv1 with cipher ADH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTPS id 263EB853; Wed, 23 Jul 2014 08:00:36 +0000 (UTC) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id 10AF52D4E; Wed, 23 Jul 2014 08:00:36 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.8/8.14.8) with ESMTP id s6N80aqn004631; Wed, 23 Jul 2014 08:00:36 GMT (envelope-from delphij@svn.freebsd.org) Received: (from delphij@localhost) by svn.freebsd.org (8.14.8/8.14.8/Submit) id s6N80ZxU004625; Wed, 23 Jul 2014 08:00:35 GMT (envelope-from delphij@svn.freebsd.org) Message-Id: <201407230800.s6N80ZxU004625@svn.freebsd.org> From: Xin LI Date: Wed, 23 Jul 2014 08:00:35 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org Subject: svn commit: r269010 - vendor-sys/illumos/dist/common/zfs vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor-sys/illumos/dist/uts/common/sys/fs vendor/ill... X-SVN-Group: vendor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-vendor@freebsd.org X-Mailman-Version: 2.1.18 Precedence: list List-Id: SVN commit messages for the vendor work area tree List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 23 Jul 2014 08:00:36 -0000 Author: delphij Date: Wed Jul 23 08:00:34 2014 New Revision: 269010 URL: http://svnweb.freebsd.org/changeset/base/269010 Log: 4976 zfs should only avoid writing to a failing non-redundant top-level vdev 4977 mdb error in ::spa_space from space_cb() if a metaslab's ms_sm is NULL 4978 ztest fails in get_metaslab_refcount() 4979 extend free space histogram to device and pool 4980 metaslabs should have a fragmentation metric 4981 remove fragmented ops vector from block allocator 4982 space_map object should proactively upgrade when feature is enabled 4983 need to collect metaslab information via mdb 4984 device selection should use fragmentation metric Reviewed by: Matthew Ahrens Reviewed by: Adam Leventhal Reviewed by: Christopher Siden Approved by: Garrett D'Amore illumos/illumos-gate@2e4c998613148111f2fc5371085331ffb39122ff Modified: vendor/illumos/dist/cmd/zdb/zdb.c vendor/illumos/dist/cmd/zpool/zpool_main.c vendor/illumos/dist/lib/libzfs/common/libzfs_pool.c vendor/illumos/dist/man/man1m/zdb.1m vendor/illumos/dist/man/man1m/zpool.1m Changes in other areas also in this revision: Modified: vendor-sys/illumos/dist/common/zfs/zpool_prop.c vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_map.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_debug.h vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h Modified: vendor/illumos/dist/cmd/zdb/zdb.c ============================================================================== --- vendor/illumos/dist/cmd/zdb/zdb.c Wed Jul 23 05:40:28 2014 (r269009) +++ vendor/illumos/dist/cmd/zdb/zdb.c Wed Jul 23 08:00:34 2014 (r269010) @@ -111,11 +111,11 @@ static void usage(void) { (void) fprintf(stderr, - "Usage: %s [-CumdibcsDvhLXFPA] [-t txg] [-e [-p path...]] " - "[-U config] [-M inflight I/Os] [-x dumpdir] poolname [object...]\n" + "Usage: %s [-CumMdibcsDvhLXFPA] [-t txg] [-e [-p path...]] " + "[-U config] [-I inflight I/Os] [-x dumpdir] poolname [object...]\n" " %s [-divPA] [-e -p path...] [-U config] dataset " "[object...]\n" - " %s -m [-LXFPA] [-t txg] [-e [-p path...]] [-U config] " + " %s -mM [-LXFPA] [-t txg] [-e [-p path...]] [-U config] " "poolname [vdev [metaslab...]]\n" " %s -R [-A] [-e [-p path...]] poolname " "vdev:offset:size[:flags]\n" @@ -138,6 +138,7 @@ usage(void) (void) fprintf(stderr, " -h pool history\n"); (void) fprintf(stderr, " -b block statistics\n"); (void) fprintf(stderr, " -m metaslabs\n"); + (void) fprintf(stderr, " -M metaslab groups\n"); (void) fprintf(stderr, " -c checksum all metadata (twice for " "all data) blocks\n"); (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); @@ -168,7 +169,7 @@ usage(void) (void) fprintf(stderr, " -P print numbers in parseable form\n"); (void) fprintf(stderr, " -t -- highest txg to use when " "searching for uberblocks\n"); - (void) fprintf(stderr, " -M -- " + (void) fprintf(stderr, " -I -- " "specify the maximum number of " "checksumming I/Os [default is 200]\n"); (void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " @@ -548,7 +549,7 @@ get_metaslab_refcount(vdev_t *vd) { int refcount = 0; - if (vd->vdev_top == vd) { + if (vd->vdev_top == vd && !vd->vdev_removing) { for (int m = 0; m < vd->vdev_ms_count; m++) { space_map_t *sm = vd->vdev_ms[m]->ms_sm; @@ -686,9 +687,10 @@ dump_metaslab(metaslab_t *msp) * The space map histogram represents free space in chunks * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). */ - (void) printf("\tOn-disk histogram:\n"); + (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", + (u_longlong_t)msp->ms_fragmentation); dump_histogram(sm->sm_phys->smp_histogram, - SPACE_MAP_HISTOGRAM_SIZE(sm), sm->sm_shift); + SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); } if (dump_opt['d'] > 5 || dump_opt['m'] > 3) { @@ -713,6 +715,47 @@ print_vdev_metaslab_header(vdev_t *vd) } static void +dump_metaslab_groups(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + metaslab_class_t *mc = spa_normal_class(spa); + uint64_t fragmentation; + + metaslab_class_histogram_verify(mc); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (mg->mg_class != mc) + continue; + + metaslab_group_histogram_verify(mg); + mg->mg_fragmentation = metaslab_group_fragmentation(mg); + + (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" + "fragmentation", + (u_longlong_t)tvd->vdev_id, + (u_longlong_t)tvd->vdev_ms_count); + if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { + (void) printf("%3s\n", "-"); + } else { + (void) printf("%3llu%%\n", + (u_longlong_t)mg->mg_fragmentation); + } + dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); + } + + (void) printf("\tpool %s\tfragmentation", spa_name(spa)); + fragmentation = metaslab_class_fragmentation(mc); + if (fragmentation == ZFS_FRAG_INVALID) + (void) printf("\t%3s\n", "-"); + else + (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); + dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); +} + +static void dump_metaslabs(spa_t *spa) { vdev_t *vd, *rvd = spa->spa_root_vdev; @@ -2340,8 +2383,7 @@ zdb_leak(void *arg, uint64_t start, uint } static metaslab_ops_t zdb_metaslab_ops = { - NULL, /* alloc */ - NULL /* fragmented */ + NULL /* alloc */ }; static void @@ -2836,6 +2878,8 @@ dump_zpool(spa_t *spa) if (dump_opt['d'] > 2 || dump_opt['m']) dump_metaslabs(spa); + if (dump_opt['M']) + dump_metaslab_groups(spa); if (dump_opt['d'] || dump_opt['i']) { dump_dir(dp->dp_meta_objset); @@ -3330,7 +3374,7 @@ main(int argc, char **argv) dprintf_setup(&argc, argv); while ((c = getopt(argc, argv, - "bcdhilmM:suCDRSAFLXx:evp:t:U:P")) != -1) { + "bcdhilmMI:suCDRSAFLXx:evp:t:U:P")) != -1) { switch (c) { case 'b': case 'c': @@ -3343,6 +3387,7 @@ main(int argc, char **argv) case 'u': case 'C': case 'D': + case 'M': case 'R': case 'S': dump_opt[c]++; @@ -3356,10 +3401,7 @@ main(int argc, char **argv) case 'P': dump_opt[c]++; break; - case 'v': - verbose++; - break; - case 'M': + case 'I': max_inflight = strtoull(optarg, NULL, 0); if (max_inflight == 0) { (void) fprintf(stderr, "maximum number " @@ -3383,9 +3425,6 @@ main(int argc, char **argv) } searchdirs[nsearch++] = optarg; break; - case 'x': - vn_dumpdir = optarg; - break; case 't': max_txg = strtoull(optarg, NULL, 0); if (max_txg < TXG_INITIAL) { @@ -3397,6 +3436,12 @@ main(int argc, char **argv) case 'U': spa_config_path = optarg; break; + case 'v': + verbose++; + break; + case 'x': + vn_dumpdir = optarg; + break; default: usage(); break; Modified: vendor/illumos/dist/cmd/zpool/zpool_main.c ============================================================================== --- vendor/illumos/dist/cmd/zpool/zpool_main.c Wed Jul 23 05:40:28 2014 (r269009) +++ vendor/illumos/dist/cmd/zpool/zpool_main.c Wed Jul 23 08:00:34 2014 (r269010) @@ -2754,10 +2754,15 @@ print_one_column(zpool_prop_t prop, uint boolean_t fixed; size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); - zfs_nicenum(value, propval, sizeof (propval)); if (prop == ZPOOL_PROP_EXPANDSZ && value == 0) (void) strlcpy(propval, "-", sizeof (propval)); + else if (prop == ZPOOL_PROP_FRAGMENTATION && value == ZFS_FRAG_INVALID) + (void) strlcpy(propval, "-", sizeof (propval)); + else if (prop == ZPOOL_PROP_FRAGMENTATION) + (void) snprintf(propval, sizeof (propval), "%llu%%", value); + else + zfs_nicenum(value, propval, sizeof (propval)); if (scripted) (void) printf("\t%s", propval); @@ -2790,9 +2795,9 @@ print_list_stats(zpool_handle_t *zhp, co /* only toplevel vdevs have capacity stats */ if (vs->vs_space == 0) { if (scripted) - (void) printf("\t-\t-\t-"); + (void) printf("\t-\t-\t-\t-"); else - (void) printf(" - - -"); + (void) printf(" - - - -"); } else { print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, scripted); @@ -2800,6 +2805,8 @@ print_list_stats(zpool_handle_t *zhp, co scripted); print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, scripted); + print_one_column(ZPOOL_PROP_FRAGMENTATION, + vs->vs_fragmentation, scripted); } print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, scripted); @@ -2885,8 +2892,8 @@ zpool_do_list(int argc, char **argv) int ret; list_cbdata_t cb = { 0 }; static char default_props[] = - "name,size,allocated,free,expandsize,capacity,dedupratio," - "health,altroot"; + "name,size,allocated,free,fragmentation,expandsize,capacity," + "dedupratio,health,altroot"; char *props = default_props; unsigned long interval = 0, count = 0; zpool_list_t *list; Modified: vendor/illumos/dist/lib/libzfs/common/libzfs_pool.c ============================================================================== --- vendor/illumos/dist/lib/libzfs/common/libzfs_pool.c Wed Jul 23 05:40:28 2014 (r269009) +++ vendor/illumos/dist/lib/libzfs/common/libzfs_pool.c Wed Jul 23 08:00:34 2014 (r269010) @@ -294,6 +294,14 @@ zpool_get_prop(zpool_handle_t *zhp, zpoo (u_longlong_t)intval); } break; + case ZPOOL_PROP_FRAGMENTATION: + if (intval == UINT64_MAX) { + (void) strlcpy(buf, "-", len); + } else { + (void) snprintf(buf, len, "%llu%%", + (u_longlong_t)intval); + } + break; case ZPOOL_PROP_DEDUPRATIO: (void) snprintf(buf, len, "%llu.%02llux", Modified: vendor/illumos/dist/man/man1m/zdb.1m ============================================================================== --- vendor/illumos/dist/man/man1m/zdb.1m Wed Jul 23 05:40:28 2014 (r269009) +++ vendor/illumos/dist/man/man1m/zdb.1m Wed Jul 23 08:00:34 2014 (r269010) @@ -19,8 +19,8 @@ \fBzdb\fR - Display zpool debugging and consistency information .SH "SYNOPSIS" -\fBzdb\fR [-CumdibcsDvhLXFPA] [-e [-p \fIpath\fR...]] [-t \fItxg\fR] - [-U \fIcache\fR] [-M \fIinflight I/Os\fR] [-x \fIdumpdir\fR] +\fBzdb\fR [-CumdibcsDvhLMXFPA] [-e [-p \fIpath\fR...]] [-t \fItxg\fR] + [-U \fIcache\fR] [-I \fIinflight I/Os\fR] [-x \fIdumpdir\fR] [\fIpoolname\fR [\fIobject\fR ...]] .P @@ -28,7 +28,7 @@ \fIdataset\fR [\fIobject\fR ...] .P -\fBzdb\fR -m [-LXFPA] [-t \fItxg\fR] [-e [-p \fIpath\fR...]] [-U \fIcache\fR] +\fBzdb\fR -m [-MLXFPA] [-t \fItxg\fR] [-e [-p \fIpath\fR...]] [-U \fIcache\fR] \fIpoolname\fR [\fIvdev\fR [\fImetaslab\fR ...]] .P @@ -194,6 +194,21 @@ verifies that all non-free blocks are re .sp .6 .RS 4n Display the offset, spacemap, and free space of each metaslab. +When specified twice, also display information about the on-disk free +space histogram associated with each metaslab. When specified three time, +display the maximum contiguous free space, the in-core free space histogram, +and the percentage of free space in each space map. When specified +four times display every spacemap record. +.RE + +.sp +.ne 2 +.na +\fB-M\fR +.ad +.sp .6 +.RS 4n +Display the offset, spacemap, and free space of each metaslab. When specified twice, also display information about the maximum contiguous free space and the percentage of free space in each space map. When specified three times display every spacemap record. @@ -380,7 +395,7 @@ transactions. .sp .ne 2 .na -\fB-M \fIinflight I/Os\fR \fR +\fB-I \fIinflight I/Os\fR \fR .ad .sp .6 .RS 4n Modified: vendor/illumos/dist/man/man1m/zpool.1m ============================================================================== --- vendor/illumos/dist/man/man1m/zpool.1m Wed Jul 23 05:40:28 2014 (r269009) +++ vendor/illumos/dist/man/man1m/zpool.1m Wed Jul 23 08:00:34 2014 (r269010) @@ -1,7 +1,7 @@ '\" te .\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. .\" Copyright 2011, Nexenta Systems, Inc. All Rights Reserved. -.\" Copyright (c) 2012 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Delphix. All rights reserved. .\" The contents of this file are subject to the terms of the Common Development .\" and Distribution License (the "License"). You may not use this file except .\" in compliance with the License. You can obtain a copy of the license at @@ -573,6 +573,15 @@ any space on an EFI labeled vdev which h .sp .ne 2 .na +\fB\fBfragmentation\fR\fR +.ad +.RS 20n +The amount of fragmentation in the pool. +.RE + +.sp +.ne 2 +.na \fB\fBfree\fR\fR .ad .RS 20n @@ -1648,7 +1657,7 @@ Display numbers in parsable (exact) valu .RS 12n Comma-separated list of properties to display. See the "Properties" section for a list of valid properties. The default list is "name, size, used, available, -expandsize, capacity, dedupratio, health, altroot" +fragmentation, expandsize, capacity, dedupratio, health, altroot" .RE .sp @@ -2035,10 +2044,10 @@ The results from this command are simila .in +2 .nf # \fBzpool list\fR - NAME SIZE ALLOC FREE EXPANDSZ CAP DEDUP HEALTH ALTROOT - rpool 19.9G 8.43G 11.4G - 42% 1.00x ONLINE - - tank 61.5G 20.0G 41.5G - 32% 1.00x ONLINE - - zion - - - - - - FAULTED - +NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT +rpool 19.9G 8.43G 11.4G 33% - 42% 1.00x ONLINE - +tank 61.5G 20.0G 41.5G 48% - 32% 1.00x ONLINE - +zion - - - - - - - FAULTED - .fi .in -2 .sp @@ -2259,7 +2268,7 @@ The command to remove the mirrored log \ .LP The following command dipslays the detailed information for the \fIdata\fR pool. This pool is comprised of a single \fIraidz\fR vdev where one of its -devices increased its capacity by 1GB. In this example, the pool will not +devices increased its capacity by 10GB. In this example, the pool will not be able to utilized this extra capacity until all the devices under the \fIraidz\fR vdev have been expanded. @@ -2267,12 +2276,12 @@ be able to utilized this extra capacity .in +2 .nf # \fBzpool list -v data\fR - NAME SIZE ALLOC FREE EXPANDSZ CAP DEDUP HEALTH ALTROOT - data 17.9G 174K 17.9G - 0% 1.00x ONLINE - - raidz1 17.9G 174K 17.9G - - c4t2d0 - - - 1G - c4t3d0 - - - - - c4t4d0 - - - - +NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT +data 23.9G 14.6G 9.30G 48% - 61% 1.00x ONLINE - + raidz1 23.9G 14.6G 9.30G 48% - + c1t1d0 - - - - - + c1t2d0 - - - - 10G + c1t3d0 - - - - - .fi .in -2 From owner-svn-src-vendor@FreeBSD.ORG Wed Jul 23 08:00:37 2014 Return-Path: Delivered-To: svn-src-vendor@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [8.8.178.115]) (using TLSv1 with cipher ADH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTPS id 739A5856; Wed, 23 Jul 2014 08:00:37 +0000 (UTC) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id 5DF1B2D50; Wed, 23 Jul 2014 08:00:37 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.8/8.14.8) with ESMTP id s6N80brd004645; Wed, 23 Jul 2014 08:00:37 GMT (envelope-from delphij@svn.freebsd.org) Received: (from delphij@localhost) by svn.freebsd.org (8.14.8/8.14.8/Submit) id s6N80aFJ004637; Wed, 23 Jul 2014 08:00:36 GMT (envelope-from delphij@svn.freebsd.org) Message-Id: <201407230800.s6N80aFJ004637@svn.freebsd.org> From: Xin LI Date: Wed, 23 Jul 2014 08:00:36 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org Subject: svn commit: r269010 - vendor-sys/illumos/dist/common/zfs vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor-sys/illumos/dist/uts/common/sys/fs vendor/ill... X-SVN-Group: vendor-sys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-vendor@freebsd.org X-Mailman-Version: 2.1.18 Precedence: list List-Id: SVN commit messages for the vendor work area tree List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 23 Jul 2014 08:00:37 -0000 Author: delphij Date: Wed Jul 23 08:00:34 2014 New Revision: 269010 URL: http://svnweb.freebsd.org/changeset/base/269010 Log: 4976 zfs should only avoid writing to a failing non-redundant top-level vdev 4977 mdb error in ::spa_space from space_cb() if a metaslab's ms_sm is NULL 4978 ztest fails in get_metaslab_refcount() 4979 extend free space histogram to device and pool 4980 metaslabs should have a fragmentation metric 4981 remove fragmented ops vector from block allocator 4982 space_map object should proactively upgrade when feature is enabled 4983 need to collect metaslab information via mdb 4984 device selection should use fragmentation metric Reviewed by: Matthew Ahrens Reviewed by: Adam Leventhal Reviewed by: Christopher Siden Approved by: Garrett D'Amore illumos/illumos-gate@2e4c998613148111f2fc5371085331ffb39122ff Modified: vendor-sys/illumos/dist/common/zfs/zpool_prop.c vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab_impl.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/space_map.h vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zfs_debug.h vendor-sys/illumos/dist/uts/common/fs/zfs/vdev.c vendor-sys/illumos/dist/uts/common/sys/fs/zfs.h Changes in other areas also in this revision: Modified: vendor/illumos/dist/cmd/zdb/zdb.c vendor/illumos/dist/cmd/zpool/zpool_main.c vendor/illumos/dist/lib/libzfs/common/libzfs_pool.c vendor/illumos/dist/man/man1m/zdb.1m vendor/illumos/dist/man/man1m/zpool.1m Modified: vendor-sys/illumos/dist/common/zfs/zpool_prop.c ============================================================================== --- vendor-sys/illumos/dist/common/zfs/zpool_prop.c Wed Jul 23 05:40:28 2014 (r269009) +++ vendor-sys/illumos/dist/common/zfs/zpool_prop.c Wed Jul 23 08:00:34 2014 (r269010) @@ -21,7 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include @@ -87,6 +87,8 @@ zpool_prop_init(void) PROP_READONLY, ZFS_TYPE_POOL, "", "ALLOC"); zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "EXPANDSZ"); + zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0, + PROP_READONLY, ZFS_TYPE_POOL, "", "FRAG"); zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "CAP"); zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c Wed Jul 23 05:40:28 2014 (r269009) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/metaslab.c Wed Jul 23 08:00:34 2014 (r269010) @@ -32,6 +32,7 @@ #include #include #include +#include /* * Allow allocations to switch to gang blocks quickly. We do this to @@ -79,7 +80,7 @@ int zfs_metaslab_condense_block_threshol /* * The zfs_mg_noalloc_threshold defines which metaslab groups should * be eligible for allocation. The value is defined as a percentage of - * a free space. Metaslab groups that have more free space than + * free space. Metaslab groups that have more free space than * zfs_mg_noalloc_threshold are always eligible for allocations. Once * a metaslab group's free space is less than or equal to the * zfs_mg_noalloc_threshold the allocator will avoid allocating to that @@ -92,6 +93,23 @@ int zfs_metaslab_condense_block_threshol int zfs_mg_noalloc_threshold = 0; /* + * Metaslab groups are considered eligible for allocations if their + * fragmenation metric (measured as a percentage) is less than or equal to + * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold + * then it will be skipped unless all metaslab groups within the metaslab + * class have also crossed this threshold. + */ +int zfs_mg_fragmentation_threshold = 85; + +/* + * Allow metaslabs to keep their active state as long as their fragmentation + * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An + * active metaslab that exceeds this threshold will no longer keep its active + * status allowing better metaslabs to be selected. + */ +int zfs_metaslab_fragmentation_threshold = 70; + +/* * When set will load all metaslabs when pool is first opened. */ int metaslab_debug_load = 0; @@ -136,11 +154,6 @@ int metaslab_load_pct = 50; int metaslab_unload_delay = TXG_SIZE * 2; /* - * Should we be willing to write data to degraded vdevs? - */ -boolean_t zfs_write_to_degraded = B_FALSE; - -/* * Max number of metaslabs per group to preload. */ int metaslab_preload_limit = SPA_DVAS_PER_BP; @@ -151,10 +164,21 @@ int metaslab_preload_limit = SPA_DVAS_PE boolean_t metaslab_preload_enabled = B_TRUE; /* - * Enable/disable additional weight factor for each metaslab. + * Enable/disable fragmentation weighting on metaslabs. + */ +boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; + +/* + * Enable/disable lba weighting (i.e. outer tracks are given preference). + */ +boolean_t metaslab_lba_weighting_enabled = B_TRUE; + +/* + * Enable/disable metaslab group biasing. */ -boolean_t metaslab_weight_factor_enable = B_FALSE; +boolean_t metaslab_bias_enabled = B_TRUE; +static uint64_t metaslab_fragmentation(metaslab_t *); /* * ========================================================================== @@ -247,6 +271,121 @@ metaslab_class_get_dspace(metaslab_class return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); } +void +metaslab_class_histogram_verify(metaslab_class_t *mc) +{ + vdev_t *rvd = mc->mc_spa->spa_root_vdev; + uint64_t *mc_hist; + int i; + + if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) + return; + + mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, + KM_SLEEP); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + /* + * Skip any holes, uninitialized top-levels, or + * vdevs that are not in this metalab class. + */ + if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || + mg->mg_class != mc) { + continue; + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + mc_hist[i] += mg->mg_histogram[i]; + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); + + kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); +} + +/* + * Calculate the metaslab class's fragmentation metric. The metric + * is weighted based on the space contribution of each metaslab group. + * The return value will be a number between 0 and 100 (inclusive), or + * ZFS_FRAG_INVALID if the metric has not been set. See comment above the + * zfs_frag_table for more information about the metric. + */ +uint64_t +metaslab_class_fragmentation(metaslab_class_t *mc) +{ + vdev_t *rvd = mc->mc_spa->spa_root_vdev; + uint64_t fragmentation = 0; + + spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + /* + * Skip any holes, uninitialized top-levels, or + * vdevs that are not in this metalab class. + */ + if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || + mg->mg_class != mc) { + continue; + } + + /* + * If a metaslab group does not contain a fragmentation + * metric then just bail out. + */ + if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { + spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); + return (ZFS_FRAG_INVALID); + } + + /* + * Determine how much this metaslab_group is contributing + * to the overall pool fragmentation metric. + */ + fragmentation += mg->mg_fragmentation * + metaslab_group_get_space(mg); + } + fragmentation /= metaslab_class_get_space(mc); + + ASSERT3U(fragmentation, <=, 100); + spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); + return (fragmentation); +} + +/* + * Calculate the amount of expandable space that is available in + * this metaslab class. If a device is expanded then its expandable + * space will be the amount of allocatable space that is currently not + * part of this metaslab class. + */ +uint64_t +metaslab_class_expandable_space(metaslab_class_t *mc) +{ + vdev_t *rvd = mc->mc_spa->spa_root_vdev; + uint64_t space = 0; + + spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || + mg->mg_class != mc) { + continue; + } + + space += tvd->vdev_max_asize - tvd->vdev_asize; + } + spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); + return (space); +} + /* * ========================================================================== * Metaslab groups @@ -299,7 +438,15 @@ metaslab_group_alloc_update(metaslab_gro mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / (vs->vs_space + 1); - mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold); + /* + * A metaslab group is considered allocatable if it has plenty + * of free space or is not heavily fragmented. We only take + * fragmentation into account if the metaslab group has a valid + * fragmentation metric (i.e. a value between 0 and 100). + */ + mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold && + (mg->mg_fragmentation == ZFS_FRAG_INVALID || + mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); /* * The mc_alloc_groups maintains a count of the number of @@ -320,6 +467,7 @@ metaslab_group_alloc_update(metaslab_gro mc->mc_alloc_groups--; else if (!was_allocatable && mg->mg_allocatable) mc->mc_alloc_groups++; + mutex_exit(&mg->mg_lock); } @@ -409,6 +557,7 @@ metaslab_group_passivate(metaslab_group_ } taskq_wait(mg->mg_taskq); + metaslab_group_alloc_update(mg); mgprev = mg->mg_prev; mgnext = mg->mg_next; @@ -425,20 +574,113 @@ metaslab_group_passivate(metaslab_group_ mg->mg_next = NULL; } +uint64_t +metaslab_group_get_space(metaslab_group_t *mg) +{ + return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); +} + +void +metaslab_group_histogram_verify(metaslab_group_t *mg) +{ + uint64_t *mg_hist; + vdev_t *vd = mg->mg_vd; + uint64_t ashift = vd->vdev_ashift; + int i; + + if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) + return; + + mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, + KM_SLEEP); + + ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, + SPACE_MAP_HISTOGRAM_SIZE + ashift); + + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp->ms_sm == NULL) + continue; + + for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) + mg_hist[i + ashift] += + msp->ms_sm->sm_phys->smp_histogram[i]; + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) + VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); + + kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); +} + static void -metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) +metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) { + metaslab_class_t *mc = mg->mg_class; + uint64_t ashift = mg->mg_vd->vdev_ashift; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_sm == NULL) + return; + mutex_enter(&mg->mg_lock); + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + mg->mg_histogram[i + ashift] += + msp->ms_sm->sm_phys->smp_histogram[i]; + mc->mc_histogram[i + ashift] += + msp->ms_sm->sm_phys->smp_histogram[i]; + } + mutex_exit(&mg->mg_lock); +} + +void +metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) +{ + metaslab_class_t *mc = mg->mg_class; + uint64_t ashift = mg->mg_vd->vdev_ashift; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_sm == NULL) + return; + + mutex_enter(&mg->mg_lock); + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + ASSERT3U(mg->mg_histogram[i + ashift], >=, + msp->ms_sm->sm_phys->smp_histogram[i]); + ASSERT3U(mc->mc_histogram[i + ashift], >=, + msp->ms_sm->sm_phys->smp_histogram[i]); + + mg->mg_histogram[i + ashift] -= + msp->ms_sm->sm_phys->smp_histogram[i]; + mc->mc_histogram[i + ashift] -= + msp->ms_sm->sm_phys->smp_histogram[i]; + } + mutex_exit(&mg->mg_lock); +} + +static void +metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) +{ ASSERT(msp->ms_group == NULL); + mutex_enter(&mg->mg_lock); msp->ms_group = mg; msp->ms_weight = 0; avl_add(&mg->mg_metaslab_tree, msp); mutex_exit(&mg->mg_lock); + + mutex_enter(&msp->ms_lock); + metaslab_group_histogram_add(mg, msp); + mutex_exit(&msp->ms_lock); } static void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) { + mutex_enter(&msp->ms_lock); + metaslab_group_histogram_remove(mg, msp); + mutex_exit(&msp->ms_lock); + mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, msp); @@ -451,9 +693,9 @@ metaslab_group_sort(metaslab_group_t *mg { /* * Although in principle the weight can be any value, in - * practice we do not use values in the range [1, 510]. + * practice we do not use values in the range [1, 511]. */ - ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); + ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); @@ -465,9 +707,42 @@ metaslab_group_sort(metaslab_group_t *mg } /* + * Calculate the fragmentation for a given metaslab group. We can use + * a simple average here since all metaslabs within the group must have + * the same size. The return value will be a value between 0 and 100 + * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this + * group have a fragmentation metric. + */ +uint64_t +metaslab_group_fragmentation(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + uint64_t fragmentation = 0; + uint64_t valid_ms = 0; + + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp->ms_fragmentation == ZFS_FRAG_INVALID) + continue; + + valid_ms++; + fragmentation += msp->ms_fragmentation; + } + + if (valid_ms <= vd->vdev_ms_count / 2) + return (ZFS_FRAG_INVALID); + + fragmentation /= valid_ms; + ASSERT3U(fragmentation, <=, 100); + return (fragmentation); +} + +/* * Determine if a given metaslab group should skip allocations. A metaslab - * group should avoid allocations if its used capacity has crossed the - * zfs_mg_noalloc_threshold and there is at least one metaslab group + * group should avoid allocations if its free capacity is less than the + * zfs_mg_noalloc_threshold or its fragmentation metric is greater than + * zfs_mg_fragmentation_threshold and there is at least one metaslab group * that can still handle allocations. */ static boolean_t @@ -478,12 +753,19 @@ metaslab_group_allocatable(metaslab_grou metaslab_class_t *mc = mg->mg_class; /* - * A metaslab group is considered allocatable if its free capacity - * is greater than the set value of zfs_mg_noalloc_threshold, it's - * associated with a slog, or there are no other metaslab groups - * with free capacity greater than zfs_mg_noalloc_threshold. - */ - return (mg->mg_free_capacity > zfs_mg_noalloc_threshold || + * We use two key metrics to determine if a metaslab group is + * considered allocatable -- free space and fragmentation. If + * the free space is greater than the free space threshold and + * the fragmentation is less than the fragmentation threshold then + * consider the group allocatable. There are two case when we will + * not consider these key metrics. The first is if the group is + * associated with a slog device and the second is if all groups + * in this metaslab class have already been consider ineligible + * for allocations. + */ + return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold && + (mg->mg_fragmentation == ZFS_FRAG_INVALID || + mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) || mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); } @@ -707,16 +989,8 @@ metaslab_ff_alloc(metaslab_t *msp, uint6 return (metaslab_block_picker(t, cursor, size, align)); } -/* ARGSUSED */ -static boolean_t -metaslab_ff_fragmented(metaslab_t *msp) -{ - return (B_TRUE); -} - static metaslab_ops_t metaslab_ff_ops = { - metaslab_ff_alloc, - metaslab_ff_fragmented + metaslab_ff_alloc }; /* @@ -763,23 +1037,8 @@ metaslab_df_alloc(metaslab_t *msp, uint6 return (metaslab_block_picker(t, cursor, size, 1ULL)); } -static boolean_t -metaslab_df_fragmented(metaslab_t *msp) -{ - range_tree_t *rt = msp->ms_tree; - uint64_t max_size = metaslab_block_maxsize(msp); - int free_pct = range_tree_space(rt) * 100 / msp->ms_size; - - if (max_size >= metaslab_df_alloc_threshold && - free_pct >= metaslab_df_free_pct) - return (B_FALSE); - - return (B_TRUE); -} - static metaslab_ops_t metaslab_df_ops = { - metaslab_df_alloc, - metaslab_df_fragmented + metaslab_df_alloc }; /* @@ -822,15 +1081,8 @@ metaslab_cf_alloc(metaslab_t *msp, uint6 return (offset); } -static boolean_t -metaslab_cf_fragmented(metaslab_t *msp) -{ - return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size); -} - static metaslab_ops_t metaslab_cf_ops = { - metaslab_cf_alloc, - metaslab_cf_fragmented + metaslab_cf_alloc }; /* @@ -887,16 +1139,8 @@ metaslab_ndf_alloc(metaslab_t *msp, uint return (-1ULL); } -static boolean_t -metaslab_ndf_fragmented(metaslab_t *msp) -{ - return (metaslab_block_maxsize(msp) <= - (metaslab_min_alloc_size << metaslab_ndf_clump_shift)); -} - static metaslab_ops_t metaslab_ndf_ops = { - metaslab_ndf_alloc, - metaslab_ndf_fragmented + metaslab_ndf_alloc }; metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; @@ -998,6 +1242,7 @@ metaslab_init(metaslab_group_t *mg, uint msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock); metaslab_group_add(mg, msp); + msp->ms_fragmentation = metaslab_fragmentation(msp); msp->ms_ops = mg->mg_class->mc_ops; /* @@ -1063,69 +1308,113 @@ metaslab_fini(metaslab_t *msp) kmem_free(msp, sizeof (metaslab_t)); } +#define FRAGMENTATION_TABLE_SIZE 17 + /* - * Apply a weighting factor based on the histogram information for this - * metaslab. The current weighting factor is somewhat arbitrary and requires - * additional investigation. The implementation provides a measure of - * "weighted" free space and gives a higher weighting for larger contiguous - * regions. The weighting factor is determined by counting the number of - * sm_shift sectors that exist in each region represented by the histogram. - * That value is then multiplied by the power of 2 exponent and the sm_shift - * value. + * This table defines a segment size based fragmentation metric that will + * allow each metaslab to derive its own fragmentation value. This is done + * by calculating the space in each bucket of the spacemap histogram and + * multiplying that by the fragmetation metric in this table. Doing + * this for all buckets and dividing it by the total amount of free + * space in this metaslab (i.e. the total free space in all buckets) gives + * us the fragmentation metric. This means that a high fragmentation metric + * equates to most of the free space being comprised of small segments. + * Conversely, if the metric is low, then most of the free space is in + * large segments. A 10% change in fragmentation equates to approximately + * double the number of segments. * - * For example, assume the 2^21 histogram bucket has 4 2MB regions and the - * metaslab has an sm_shift value of 9 (512B): - * - * 1) calculate the number of sm_shift sectors in the region: - * 2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384 - * 2) multiply by the power of 2 exponent and the sm_shift value: - * 16384 * 21 * 9 = 3096576 - * This value will be added to the weighting of the metaslab. + * This table defines 0% fragmented space using 16MB segments. Testing has + * shown that segments that are greater than or equal to 16MB do not suffer + * from drastic performance problems. Using this value, we derive the rest + * of the table. Since the fragmentation value is never stored on disk, it + * is possible to change these calculations in the future. + */ +int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { + 100, /* 512B */ + 100, /* 1K */ + 98, /* 2K */ + 95, /* 4K */ + 90, /* 8K */ + 80, /* 16K */ + 70, /* 32K */ + 60, /* 64K */ + 50, /* 128K */ + 40, /* 256K */ + 30, /* 512K */ + 20, /* 1M */ + 15, /* 2M */ + 10, /* 4M */ + 5, /* 8M */ + 0 /* 16M */ +}; + +/* + * Calclate the metaslab's fragmentation metric. A return value + * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does + * not support this metric. Otherwise, the return value should be in the + * range [0, 100]. */ static uint64_t -metaslab_weight_factor(metaslab_t *msp) +metaslab_fragmentation(metaslab_t *msp) { - uint64_t factor = 0; - uint64_t sectors; - int i; + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + uint64_t fragmentation = 0; + uint64_t total = 0; + boolean_t feature_enabled = spa_feature_is_enabled(spa, + SPA_FEATURE_SPACEMAP_HISTOGRAM); + + if (!feature_enabled) + return (ZFS_FRAG_INVALID); /* - * A null space map means that the entire metaslab is free, - * calculate a weight factor that spans the entire size of the - * metaslab. + * A null space map means that the entire metaslab is free + * and thus is not fragmented. */ - if (msp->ms_sm == NULL) { + if (msp->ms_sm == NULL) + return (0); + + /* + * If this metaslab's space_map has not been upgraded, flag it + * so that we upgrade next time we encounter it. + */ + if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { + uint64_t txg = spa_syncing_txg(spa); vdev_t *vd = msp->ms_group->mg_vd; - i = highbit64(msp->ms_size) - 1; - sectors = msp->ms_size >> vd->vdev_ashift; - return (sectors * i * vd->vdev_ashift); + msp->ms_condense_wanted = B_TRUE; + vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); + spa_dbgmsg(spa, "txg %llu, requesting force condense: " + "msp %p, vd %p", txg, msp, vd); + return (ZFS_FRAG_INVALID); } - if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) - return (0); + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + uint64_t space = 0; + uint8_t shift = msp->ms_sm->sm_shift; + int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, + FRAGMENTATION_TABLE_SIZE - 1); - for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) { if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) continue; - /* - * Determine the number of sm_shift sectors in the region - * indicated by the histogram. For example, given an - * sm_shift value of 9 (512 bytes) and i = 4 then we know - * that we're looking at an 8K region in the histogram - * (i.e. 9 + 4 = 13, 2^13 = 8192). To figure out the - * number of sm_shift sectors (512 bytes in this example), - * we would take 8192 / 512 = 16. Since the histogram - * is offset by sm_shift we can simply use the value of - * of i to calculate this (i.e. 2^i = 16 where i = 4). - */ - sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i; - factor += (i + msp->ms_sm->sm_shift) * sectors; + space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); + total += space; + + ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); + fragmentation += space * zfs_frag_table[idx]; } - return (factor * msp->ms_sm->sm_shift); + + if (total > 0) + fragmentation /= total; + ASSERT3U(fragmentation, <=, 100); + return (fragmentation); } +/* + * Compute a weight -- a selection preference value -- for the given metaslab. + * This is based on the amount of free space, the level of fragmentation, + * the LBA range, and whether the metaslab is loaded. + */ static uint64_t metaslab_weight(metaslab_t *msp) { @@ -1149,6 +1438,29 @@ metaslab_weight(metaslab_t *msp) * The baseline weight is the metaslab's free space. */ space = msp->ms_size - space_map_allocated(msp->ms_sm); + + msp->ms_fragmentation = metaslab_fragmentation(msp); + if (metaslab_fragmentation_factor_enabled && + msp->ms_fragmentation != ZFS_FRAG_INVALID) { + /* + * Use the fragmentation information to inversely scale + * down the baseline weight. We need to ensure that we + * don't exclude this metaslab completely when it's 100% + * fragmented. To avoid this we reduce the fragmented value + * by 1. + */ + space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; + + /* + * If space < SPA_MINBLOCKSIZE, then we will not allocate from + * this metaslab again. The fragmentation metric may have + * decreased the space to something smaller than + * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE + * so that we can consume any remaining space. + */ + if (space > 0 && space < SPA_MINBLOCKSIZE) + space = SPA_MINBLOCKSIZE; + } weight = space; /* @@ -1160,19 +1472,19 @@ metaslab_weight(metaslab_t *msp) * In effect, this means that we'll select the metaslab with the most * free bandwidth rather than simply the one with the most free space. */ - weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; - ASSERT(weight >= space && weight <= 2 * space); - - msp->ms_factor = metaslab_weight_factor(msp); - if (metaslab_weight_factor_enable) - weight += msp->ms_factor; + if (metaslab_lba_weighting_enabled) { + weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; + ASSERT(weight >= space && weight <= 2 * space); + } - if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) { - /* - * If this metaslab is one we're actively using, adjust its - * weight to make it preferable to any inactive metaslab so - * we'll polish it off. - */ + /* + * If this metaslab is one we're actively using, adjust its + * weight to make it preferable to any inactive metaslab so + * we'll polish it off. If the fragmentation on this metaslab + * has exceed our threshold, then don't mark it active. + */ + if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && + msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); } @@ -1257,9 +1569,16 @@ metaslab_group_preload(metaslab_group_t while (msp != NULL) { metaslab_t *msp_next = AVL_NEXT(t, msp); - /* If we have reached our preload limit then we're done */ - if (++m > metaslab_preload_limit) - break; + /* + * We preload only the maximum number of metaslabs specified + * by metaslab_preload_limit. If a metaslab is being forced + * to condense then we preload it too. This will ensure + * that force condensing happens in the next txg. + */ + if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { + msp = msp_next; + continue; + } /* * We must drop the metaslab group lock here to preserve @@ -1327,11 +1646,12 @@ metaslab_should_condense(metaslab_t *msp /* * Use the ms_size_tree range tree, which is ordered by size, to - * obtain the largest segment in the free tree. If the tree is empty - * then we should condense the map. + * obtain the largest segment in the free tree. We always condense + * metaslabs that are empty and metaslabs for which a condense + * request has been made. */ rs = avl_last(&msp->ms_size_tree); - if (rs == NULL) + if (rs == NULL || msp->ms_condense_wanted) return (B_TRUE); /* @@ -1372,9 +1692,14 @@ metaslab_condense(metaslab_t *msp, uint6 ASSERT3U(spa_sync_pass(spa), ==, 1); ASSERT(msp->ms_loaded); + spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, " - "smp size %llu, segments %lu", txg, msp->ms_id, msp, - space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root)); + "smp size %llu, segments %lu, forcing condense=%s", txg, + msp->ms_id, msp, space_map_length(msp->ms_sm), + avl_numnodes(&msp->ms_tree->rt_root), + msp->ms_condense_wanted ? "TRUE" : "FALSE"); + + msp->ms_condense_wanted = B_FALSE; /* * Create an range tree that is 100% allocated. We remove segments @@ -1467,8 +1792,14 @@ metaslab_sync(metaslab_t *msp, uint64_t ASSERT3P(*freetree, !=, NULL); ASSERT3P(*freed_tree, !=, NULL); + /* + * Normally, we don't want to process a metaslab if there + * are no allocations or frees to perform. However, if the metaslab + * is being forced to condense we need to let it through. + */ if (range_tree_space(alloctree) == 0 && - range_tree_space(*freetree) == 0) + range_tree_space(*freetree) == 0 && + !msp->ms_condense_wanted) return; /* @@ -1505,8 +1836,9 @@ metaslab_sync(metaslab_t *msp, uint64_t space_map_write(msp->ms_sm, *freetree, SM_FREE, tx); } - range_tree_vacate(alloctree, NULL, NULL); - + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + metaslab_group_histogram_remove(mg, msp); if (msp->ms_loaded) { /* * When the space map is loaded, we have an accruate @@ -1526,6 +1858,9 @@ metaslab_sync(metaslab_t *msp, uint64_t */ space_map_histogram_add(msp->ms_sm, *freetree, tx); } + metaslab_group_histogram_add(mg, msp); + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); /* * For sync pass 1, we avoid traversing this txg's free range tree @@ -1538,6 +1873,7 @@ metaslab_sync(metaslab_t *msp, uint64_t } else { range_tree_vacate(*freetree, range_tree_add, *freed_tree); } + range_tree_vacate(alloctree, NULL, NULL); ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); @@ -1648,13 +1984,13 @@ metaslab_sync_done(metaslab_t *msp, uint metaslab_group_sort(mg, msp, metaslab_weight(msp)); mutex_exit(&msp->ms_lock); - } void metaslab_sync_reassess(metaslab_group_t *mg) { metaslab_group_alloc_update(mg); + mg->mg_fragmentation = metaslab_group_fragmentation(mg); /* * Preload the next potential metaslabs @@ -1916,9 +2252,7 @@ top: */ if ((vd->vdev_stat.vs_write_errors > 0 || vd->vdev_state < VDEV_STATE_HEALTHY) && - d == 0 && dshift == 3 && - !(zfs_write_to_degraded && vd->vdev_state == - VDEV_STATE_DEGRADED)) { + d == 0 && dshift == 3 && vd->vdev_children == 0) { all_zero = B_FALSE; goto next; } @@ -1943,7 +2277,7 @@ top: * over- or under-used relative to the pool, * and set an allocation bias to even it out. */ - if (mc->mc_aliquot == 0) { + if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { vdev_stat_t *vs = &vd->vdev_stat; int64_t vu, cu; @@ -1965,6 +2299,8 @@ top: */ mg->mg_bias = ((cu - vu) * (int64_t)mg->mg_aliquot) / 100; + } else if (!metaslab_bias_enabled) { + mg->mg_bias = 0; } if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c Wed Jul 23 05:40:28 2014 (r269009) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/range_tree.c Wed Jul 23 08:00:34 2014 (r269010) @@ -81,6 +81,7 @@ range_tree_stat_incr(range_tree_t *rt, r uint64_t size = rs->rs_end - rs->rs_start; int idx = highbit64(size) - 1; + ASSERT(size != 0); ASSERT3U(idx, <, sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); @@ -95,6 +96,7 @@ range_tree_stat_decr(range_tree_t *rt, r uint64_t size = rs->rs_end - rs->rs_start; int idx = highbit64(size) - 1; + ASSERT(size != 0); ASSERT3U(idx, <, sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c Wed Jul 23 05:40:28 2014 (r269009) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/spa.c Wed Jul 23 08:00:34 2014 (r269010) @@ -194,12 +194,10 @@ spa_prop_get_config(spa_t *spa, nvlist_t { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; - uint64_t size; - uint64_t alloc; - uint64_t space; - uint64_t cap, version; + uint64_t size, alloc, cap, version; zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; + metaslab_class_t *mc = spa_normal_class(spa); ASSERT(MUTEX_HELD(&spa->spa_props_lock)); @@ -212,14 +210,10 @@ spa_prop_get_config(spa_t *spa, nvlist_t spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); - space = 0; - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - space += tvd->vdev_max_asize - tvd->vdev_asize; - } - spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space, - src); - + spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, + metaslab_class_fragmentation(mc), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, + metaslab_class_expandable_space(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == FREAD), src); Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c Wed Jul 23 05:40:28 2014 (r269009) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/space_map.c Wed Jul 23 08:00:34 2014 (r269010) @@ -202,10 +202,10 @@ space_map_histogram_add(space_map_t *sm, * reached the maximum bucket size. Accumulate all ranges * larger than the max bucket size into the last bucket. */ - if (idx < SPACE_MAP_HISTOGRAM_SIZE(sm) - 1) { + if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { ASSERT3U(idx + sm->sm_shift, ==, i); idx++; - ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE(sm)); + ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); } } } Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab.h ============================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab.h Wed Jul 23 05:40:28 2014 (r269009) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/metaslab.h Wed Jul 23 08:00:34 2014 (r269010) @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_H @@ -38,23 +38,22 @@ extern "C" { typedef struct metaslab_ops { uint64_t (*msop_alloc)(metaslab_t *msp, uint64_t size); - boolean_t (*msop_fragmented)(metaslab_t *msp); } metaslab_ops_t; extern metaslab_ops_t *zfs_metaslab_ops; -metaslab_t *metaslab_init(metaslab_group_t *mg, uint64_t id, - uint64_t object, uint64_t txg); -void metaslab_fini(metaslab_t *msp); - -void metaslab_load_wait(metaslab_t *msp); -int metaslab_load(metaslab_t *msp); -void metaslab_unload(metaslab_t *msp); - -void metaslab_sync(metaslab_t *msp, uint64_t txg); -void metaslab_sync_done(metaslab_t *msp, uint64_t txg); *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***