From owner-svn-src-all@FreeBSD.ORG Sat Aug 28 08:59:55 2010 Return-Path: Delivered-To: svn-src-all@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id EA73D10656AB; Sat, 28 Aug 2010 08:59:55 +0000 (UTC) (envelope-from mm@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id D755D8FC18; Sat, 28 Aug 2010 08:59:55 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.3/8.14.3) with ESMTP id o7S8xt2v024931; Sat, 28 Aug 2010 08:59:55 GMT (envelope-from mm@svn.freebsd.org) Received: (from mm@localhost) by svn.freebsd.org (8.14.3/8.14.3/Submit) id o7S8xtM6024918; Sat, 28 Aug 2010 08:59:55 GMT (envelope-from mm@svn.freebsd.org) Message-Id: <201008280859.o7S8xtM6024918@svn.freebsd.org> From: Martin Matuska Date: Sat, 28 Aug 2010 08:59:55 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r211931 - in head: cddl/contrib/opensolaris/cmd/zdb sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 28 Aug 2010 08:59:56 -0000 Author: mm Date: Sat Aug 28 08:59:55 2010 New Revision: 211931 URL: http://svn.freebsd.org/changeset/base/211931 Log: Update ZFS metaslab code from OpenSolaris. This provides a noticeable write speedup, especially on pools with less than 30% of free space. Detailed information (OpenSolaris onnv changesets and Bug IDs): 11146:7e58f40bcb1c 6826241 Sync write IOPS drops dramatically during TXG sync 6869229 zfs should switch to shiny new metaslabs more frequently 11728:59fdb3b856f6 6918420 zdb -m has issues printing metaslab statistics 12047:7c1fcc8419ca 6917066 zfs block picking can be improved Approved by: delphij (mentor) Obtained from: OpenSolaris (Bug ID 6826241, 6869229, 6918420, 6917066) MFC after: 2 weeks Modified: head/cddl/contrib/opensolaris/cmd/zdb/zdb.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Modified: head/cddl/contrib/opensolaris/cmd/zdb/zdb.c ============================================================================== --- head/cddl/contrib/opensolaris/cmd/zdb/zdb.c Sat Aug 28 08:57:15 2010 (r211930) +++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.c Sat Aug 28 08:59:55 2010 (r211931) @@ -491,35 +491,37 @@ dump_metaslab_stats(metaslab_t *msp) static void dump_metaslab(metaslab_t *msp) { - char freebuf[5]; - space_map_obj_t *smo = &msp->ms_smo; vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; + space_map_t *sm = &msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo; + char freebuf[5]; - nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf); + nicenum(sm->sm_size - smo->smo_alloc, freebuf); (void) printf( "\tvdev %5llu offset %12llx spacemap %6llu free %5s\n", - (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start, - (u_longlong_t)smo->smo_object, freebuf); + (u_longlong_t)(sm->sm_start / sm->sm_size), + (u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf); if (dump_opt['m'] > 1) { mutex_enter(&msp->ms_lock); - VERIFY(space_map_load(&msp->ms_map, zfs_metaslab_ops, - SM_FREE, &msp->ms_smo, spa->spa_meta_objset) == 0); + space_map_load_wait(sm); + if (!sm->sm_loaded) + VERIFY(space_map_load(sm, zfs_metaslab_ops, + SM_FREE, smo, spa->spa_meta_objset) == 0); dump_metaslab_stats(msp); - space_map_unload(&msp->ms_map); + space_map_unload(sm); mutex_exit(&msp->ms_lock); } if (dump_opt['d'] > 5 || dump_opt['m'] > 2) { - ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift)); + ASSERT(sm->sm_size == (1ULL << vd->vdev_ms_shift)); mutex_enter(&msp->ms_lock); - dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map); + dump_spacemap(spa->spa_meta_objset, smo, sm); mutex_exit(&msp->ms_lock); } - } static void Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c Sat Aug 28 08:57:15 2010 (r211930) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c Sat Aug 28 08:59:55 2010 (r211931) @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -37,7 +36,7 @@ uint64_t metaslab_gang_bang = SPA_MAXBLO /* * Minimum size which forces the dynamic allocator to change - * it's allocation strategy. Once the space map cannot satisfy + * it's allocation strategy. Once the space map cannot satisfy * an allocation of this size then it switches to using more * aggressive strategy (i.e search by size rather than offset). */ @@ -49,7 +48,23 @@ uint64_t metaslab_df_alloc_threshold = S * Once the space_map's free space drops below this level we dynamically * switch to using best-fit allocations. */ -int metaslab_df_free_pct = 30; +int metaslab_df_free_pct = 4; + +/* + * A metaslab is considered "free" if it contains a contiguous + * segment which is greater than metaslab_min_alloc_size. + */ +uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; + +/* + * Max number of space_maps to prefetch. + */ +int metaslab_prefetch_limit = SPA_DVAS_PER_BP; + +/* + * Percentage bonus multiplier for metaslabs that are in the bonus area. + */ +int metaslab_smo_bonus_pct = 150; /* * ========================================================================== @@ -219,6 +234,32 @@ metaslab_group_sort(metaslab_group_t *mg } /* + * ========================================================================== + * Common allocator routines + * ========================================================================== + */ +static int +metaslab_segsize_compare(const void *x1, const void *x2) +{ + const space_seg_t *s1 = x1; + const space_seg_t *s2 = x2; + uint64_t ss_size1 = s1->ss_end - s1->ss_start; + uint64_t ss_size2 = s2->ss_end - s2->ss_start; + + if (ss_size1 < ss_size2) + return (-1); + if (ss_size1 > ss_size2) + return (1); + + if (s1->ss_start < s2->ss_start) + return (-1); + if (s1->ss_start > s2->ss_start) + return (1); + + return (0); +} + +/* * This is a helper function that can be used by the allocator to find * a suitable block to allocate. This will search the specified AVL * tree looking for a block that matches the specified criteria. @@ -258,68 +299,58 @@ metaslab_block_picker(avl_tree_t *t, uin return (metaslab_block_picker(t, cursor, size, align)); } -/* - * ========================================================================== - * The first-fit block allocator - * ========================================================================== - */ static void -metaslab_ff_load(space_map_t *sm) +metaslab_pp_load(space_map_t *sm) { + space_seg_t *ss; + ASSERT(sm->sm_ppd == NULL); sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); - sm->sm_pp_root = NULL; + + sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); + avl_create(sm->sm_pp_root, metaslab_segsize_compare, + sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); + + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + avl_add(sm->sm_pp_root, ss); } static void -metaslab_ff_unload(space_map_t *sm) +metaslab_pp_unload(space_map_t *sm) { + void *cookie = NULL; + kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); sm->sm_ppd = NULL; -} -static uint64_t -metaslab_ff_alloc(space_map_t *sm, uint64_t size) -{ - avl_tree_t *t = &sm->sm_root; - uint64_t align = size & -size; - uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { + /* tear down the tree */ + } - return (metaslab_block_picker(t, cursor, size, align)); + avl_destroy(sm->sm_pp_root); + kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); + sm->sm_pp_root = NULL; } /* ARGSUSED */ static void -metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size) +metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) { /* No need to update cursor */ } /* ARGSUSED */ static void -metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size) +metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size) { /* No need to update cursor */ } -static space_map_ops_t metaslab_ff_ops = { - metaslab_ff_load, - metaslab_ff_unload, - metaslab_ff_alloc, - metaslab_ff_claim, - metaslab_ff_free, - NULL /* maxsize */ -}; - /* - * Dynamic block allocator - - * Uses the first fit allocation scheme until space get low and then - * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold - * and metaslab_df_free_pct to determine when to switch the allocation scheme. + * Return the maximum contiguous segment within the metaslab. */ - uint64_t -metaslab_df_maxsize(space_map_t *sm) +metaslab_pp_maxsize(space_map_t *sm) { avl_tree_t *t = sm->sm_pp_root; space_seg_t *ss; @@ -330,67 +361,53 @@ metaslab_df_maxsize(space_map_t *sm) return (ss->ss_end - ss->ss_start); } -static int -metaslab_df_seg_compare(const void *x1, const void *x2) +/* + * ========================================================================== + * The first-fit block allocator + * ========================================================================== + */ +static uint64_t +metaslab_ff_alloc(space_map_t *sm, uint64_t size) { - const space_seg_t *s1 = x1; - const space_seg_t *s2 = x2; - uint64_t ss_size1 = s1->ss_end - s1->ss_start; - uint64_t ss_size2 = s2->ss_end - s2->ss_start; - - if (ss_size1 < ss_size2) - return (-1); - if (ss_size1 > ss_size2) - return (1); - - if (s1->ss_start < s2->ss_start) - return (-1); - if (s1->ss_start > s2->ss_start) - return (1); + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; - return (0); + return (metaslab_block_picker(t, cursor, size, align)); } -static void -metaslab_df_load(space_map_t *sm) +/* ARGSUSED */ +boolean_t +metaslab_ff_fragmented(space_map_t *sm) { - space_seg_t *ss; - - ASSERT(sm->sm_ppd == NULL); - sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); - - sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); - avl_create(sm->sm_pp_root, metaslab_df_seg_compare, - sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); - - for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) - avl_add(sm->sm_pp_root, ss); + return (B_TRUE); } -static void -metaslab_df_unload(space_map_t *sm) -{ - void *cookie = NULL; - - kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); - sm->sm_ppd = NULL; - - while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { - /* tear down the tree */ - } - - avl_destroy(sm->sm_pp_root); - kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); - sm->sm_pp_root = NULL; -} +static space_map_ops_t metaslab_ff_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_ff_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_ff_fragmented +}; +/* + * ========================================================================== + * Dynamic block allocator - + * Uses the first fit allocation scheme until space get low and then + * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold + * and metaslab_df_free_pct to determine when to switch the allocation scheme. + * ========================================================================== + */ static uint64_t metaslab_df_alloc(space_map_t *sm, uint64_t size) { avl_tree_t *t = &sm->sm_root; uint64_t align = size & -size; uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; - uint64_t max_size = metaslab_df_maxsize(sm); + uint64_t max_size = metaslab_pp_maxsize(sm); int free_pct = sm->sm_space * 100 / sm->sm_size; ASSERT(MUTEX_HELD(sm->sm_lock)); @@ -412,30 +429,158 @@ metaslab_df_alloc(space_map_t *sm, uint6 return (metaslab_block_picker(t, cursor, size, 1ULL)); } -/* ARGSUSED */ -static void -metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size) +static boolean_t +metaslab_df_fragmented(space_map_t *sm) { - /* No need to update cursor */ -} + uint64_t max_size = metaslab_pp_maxsize(sm); + int free_pct = sm->sm_space * 100 / sm->sm_size; -/* ARGSUSED */ -static void -metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size) -{ - /* No need to update cursor */ + if (max_size >= metaslab_df_alloc_threshold && + free_pct >= metaslab_df_free_pct) + return (B_FALSE); + + return (B_TRUE); } static space_map_ops_t metaslab_df_ops = { - metaslab_df_load, - metaslab_df_unload, + metaslab_pp_load, + metaslab_pp_unload, metaslab_df_alloc, - metaslab_df_claim, - metaslab_df_free, - metaslab_df_maxsize + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_df_fragmented +}; + +/* + * ========================================================================== + * Other experimental allocators + * ========================================================================== + */ +static uint64_t +metaslab_cdf_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t *cursor = (uint64_t *)sm->sm_ppd; + uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; + uint64_t max_size = metaslab_pp_maxsize(sm); + uint64_t rsize = size; + uint64_t offset = 0; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + + if (max_size < size) + return (-1ULL); + + ASSERT3U(*extent_end, >=, *cursor); + + /* + * If we're running low on space switch to using the size + * sorted AVL tree (best-fit). + */ + if ((*cursor + size) > *extent_end) { + + t = sm->sm_pp_root; + *cursor = *extent_end = 0; + + if (max_size > 2 * SPA_MAXBLOCKSIZE) + rsize = MIN(metaslab_min_alloc_size, max_size); + offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); + if (offset != -1) + *cursor = offset + size; + } else { + offset = metaslab_block_picker(t, cursor, rsize, 1ULL); + } + ASSERT3U(*cursor, <=, *extent_end); + return (offset); +} + +static boolean_t +metaslab_cdf_fragmented(space_map_t *sm) +{ + uint64_t max_size = metaslab_pp_maxsize(sm); + + if (max_size > (metaslab_min_alloc_size * 10)) + return (B_FALSE); + return (B_TRUE); +} + +static space_map_ops_t metaslab_cdf_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_cdf_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_cdf_fragmented +}; + +uint64_t metaslab_ndf_clump_shift = 4; + +static uint64_t +metaslab_ndf_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + avl_index_t where; + space_seg_t *ss, ssearch; + uint64_t hbit = highbit(size); + uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1; + uint64_t max_size = metaslab_pp_maxsize(sm); + + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + + if (max_size < size) + return (-1ULL); + + ssearch.ss_start = *cursor; + ssearch.ss_end = *cursor + size; + + ss = avl_find(t, &ssearch, &where); + if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { + t = sm->sm_pp_root; + + ssearch.ss_start = 0; + ssearch.ss_end = MIN(max_size, + 1ULL << (hbit + metaslab_ndf_clump_shift)); + ss = avl_find(t, &ssearch, &where); + if (ss == NULL) + ss = avl_nearest(t, where, AVL_AFTER); + ASSERT(ss != NULL); + } + + if (ss != NULL) { + if (ss->ss_start + size <= ss->ss_end) { + *cursor = ss->ss_start + size; + return (ss->ss_start); + } + } + return (-1ULL); +} + +static boolean_t +metaslab_ndf_fragmented(space_map_t *sm) +{ + uint64_t max_size = metaslab_pp_maxsize(sm); + + if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift)) + return (B_FALSE); + return (B_TRUE); +} + + +static space_map_ops_t metaslab_ndf_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_ndf_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_ndf_fragmented }; -space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; +space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; /* * ========================================================================== @@ -522,7 +667,6 @@ metaslab_fini(metaslab_t *msp) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) #define METASLAB_ACTIVE_MASK \ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) -#define METASLAB_SMO_BONUS_MULTIPLIER 2 static uint64_t metaslab_weight(metaslab_t *msp) @@ -555,25 +699,60 @@ metaslab_weight(metaslab_t *msp) ASSERT(weight >= space && weight <= 2 * space); /* - * For locality, assign higher weight to metaslabs we've used before. + * For locality, assign higher weight to metaslabs which have + * a lower offset than what we've already activated. */ - if (smo->smo_object != 0) - weight *= METASLAB_SMO_BONUS_MULTIPLIER; + if (sm->sm_start <= mg->mg_bonus_area) + weight *= (metaslab_smo_bonus_pct / 100); ASSERT(weight >= space && - weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space); + weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); + + if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { + /* + * If this metaslab is one we're actively using, adjust its + * weight to make it preferable to any inactive metaslab so + * we'll polish it off. + */ + weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + } + return (weight); +} + +static void +metaslab_prefetch(metaslab_group_t *mg) +{ + spa_t *spa = mg->mg_vd->vdev_spa; + metaslab_t *msp; + avl_tree_t *t = &mg->mg_metaslab_tree; + int m; + + mutex_enter(&mg->mg_lock); /* - * If this metaslab is one we're actively using, adjust its weight to - * make it preferable to any inactive metaslab so we'll polish it off. + * Prefetch the next potential metaslabs */ - weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { + space_map_t *sm = &msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo; - return (weight); + /* If we have reached our prefetch limit then we're done */ + if (m >= metaslab_prefetch_limit) + break; + + if (!sm->sm_loaded && smo->smo_object != 0) { + mutex_exit(&mg->mg_lock); + dmu_prefetch(spa->spa_meta_objset, smo->smo_object, + 0ULL, smo->smo_objsize); + mutex_enter(&mg->mg_lock); + } + } + mutex_exit(&mg->mg_lock); } static int metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) { + metaslab_group_t *mg = msp->ms_group; space_map_t *sm = &msp->ms_map; space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; @@ -588,6 +767,15 @@ metaslab_activate(metaslab_t *msp, uint6 } /* + * Track the bonus area as we activate new metaslabs. + */ + if (sm->sm_start > mg->mg_bonus_area) { + mutex_enter(&mg->mg_lock); + mg->mg_bonus_area = sm->sm_start; + mutex_exit(&mg->mg_lock); + } + + /* * If we were able to load the map then make sure * that this map is still able to satisfy our request. */ @@ -773,6 +961,32 @@ metaslab_sync_done(metaslab_t *msp, uint mutex_exit(&msp->ms_lock); } +void +metaslab_sync_reassess(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + + /* + * Re-evaluate all metaslabs which have lower offsets than the + * bonus area. + */ + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp->ms_map.sm_start > mg->mg_bonus_area) + break; + + mutex_enter(&msp->ms_lock); + metaslab_group_sort(mg, msp, metaslab_weight(msp)); + mutex_exit(&msp->ms_lock); + } + + /* + * Prefetch the next potential metaslabs + */ + metaslab_prefetch(mg); +} + static uint64_t metaslab_distance(metaslab_t *msp, dva_t *dva) { @@ -868,7 +1082,7 @@ metaslab_group_alloc(metaslab_group_t *m if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) break; - metaslab_passivate(msp, size - 1); + metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); mutex_exit(&msp->ms_lock); } Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c Sat Aug 28 08:57:15 2010 (r211930) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c Sat Aug 28 08:59:55 2010 (r211931) @@ -74,35 +74,38 @@ enum zti_modes { zti_mode_fixed, /* value is # of threads (min 1) */ zti_mode_online_percent, /* value is % of online CPUs */ zti_mode_tune, /* fill from zio_taskq_tune_* */ + zti_mode_null, /* don't create a taskq */ zti_nmodes }; -#define ZTI_THREAD_FIX(n) { zti_mode_fixed, (n) } -#define ZTI_THREAD_PCT(n) { zti_mode_online_percent, (n) } -#define ZTI_THREAD_TUNE { zti_mode_tune, 0 } +#define ZTI_FIX(n) { zti_mode_fixed, (n) } +#define ZTI_PCT(n) { zti_mode_online_percent, (n) } +#define ZTI_TUNE { zti_mode_tune, 0 } +#define ZTI_NULL { zti_mode_null, 0 } -#define ZTI_THREAD_ONE ZTI_THREAD_FIX(1) +#define ZTI_ONE ZTI_FIX(1) typedef struct zio_taskq_info { - const char *zti_name; - struct { - enum zti_modes zti_mode; - uint_t zti_value; - } zti_nthreads[ZIO_TASKQ_TYPES]; + enum zti_modes zti_mode; + uint_t zti_value; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { - "issue", "intr" + "issue", "issue_high", "intr", "intr_high" }; -const zio_taskq_info_t zio_taskqs[ZIO_TYPES] = { - /* ISSUE INTR */ - { "spa_zio_null", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, - { "spa_zio_read", { ZTI_THREAD_FIX(8), ZTI_THREAD_TUNE } }, - { "spa_zio_write", { ZTI_THREAD_TUNE, ZTI_THREAD_FIX(8) } }, - { "spa_zio_free", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, - { "spa_zio_claim", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, - { "spa_zio_ioctl", { ZTI_THREAD_ONE, ZTI_THREAD_ONE } }, +/* + * Define the taskq threads for the following I/O types: + * NULL, READ, WRITE, FREE, CLAIM, and IOCTL + */ +const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { + /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_FIX(8), ZTI_NULL, ZTI_TUNE, ZTI_NULL }, + { ZTI_TUNE, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, }; enum zti_modes zio_taskq_tune_mode = zti_mode_online_percent; @@ -581,14 +584,14 @@ spa_activate(spa_t *spa, int mode) spa->spa_log_class = metaslab_class_create(zfs_metaslab_ops); for (int t = 0; t < ZIO_TYPES; t++) { - const zio_taskq_info_t *ztip = &zio_taskqs[t]; for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - enum zti_modes mode = ztip->zti_nthreads[q].zti_mode; - uint_t value = ztip->zti_nthreads[q].zti_value; + const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; + enum zti_modes mode = ztip->zti_mode; + uint_t value = ztip->zti_value; char name[32]; (void) snprintf(name, sizeof (name), - "%s_%s", ztip->zti_name, zio_taskq_types[q]); + "%s_%s", zio_type_name[t], zio_taskq_types[q]); if (mode == zti_mode_tune) { mode = zio_taskq_tune_mode; @@ -613,6 +616,10 @@ spa_activate(spa_t *spa, int mode) TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); break; + case zti_mode_null: + spa->spa_zio_taskq[t][q] = NULL; + break; + case zti_mode_tune: default: panic("unrecognized mode for " @@ -659,7 +666,8 @@ spa_deactivate(spa_t *spa) for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - taskq_destroy(spa->spa_zio_taskq[t][q]); + if (spa->spa_zio_taskq[t][q] != NULL) + taskq_destroy(spa->spa_zio_taskq[t][q]); spa->spa_zio_taskq[t][q] = NULL; } } Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c Sat Aug 28 08:57:15 2010 (r211930) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c Sat Aug 28 08:59:55 2010 (r211931) @@ -368,10 +368,8 @@ space_map_unload(space_map_t *sm) uint64_t space_map_maxsize(space_map_t *sm) { - if (sm->sm_loaded && sm->sm_ops != NULL) - return (sm->sm_ops->smop_max(sm)); - else - return (-1ULL); + ASSERT(sm->sm_ops != NULL); + return (sm->sm_ops->smop_max(sm)); } uint64_t Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h Sat Aug 28 08:57:15 2010 (r211930) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h Sat Aug 28 08:59:55 2010 (r211931) @@ -46,6 +46,7 @@ extern metaslab_t *metaslab_init(metasla extern void metaslab_fini(metaslab_t *msp); extern void metaslab_sync(metaslab_t *msp, uint64_t txg); extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg); +extern void metaslab_sync_reassess(metaslab_group_t *mg); #define METASLAB_HINTBP_FAVOR 0x0 #define METASLAB_HINTBP_AVOID 0x1 Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h Sat Aug 28 08:57:15 2010 (r211930) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h Sat Aug 28 08:59:55 2010 (r211931) @@ -46,6 +46,7 @@ struct metaslab_group { kmutex_t mg_lock; avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; + uint64_t mg_bonus_area; int64_t mg_bias; metaslab_class_t *mg_class; vdev_t *mg_vd; Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h Sat Aug 28 08:57:15 2010 (r211930) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h Sat Aug 28 08:59:55 2010 (r211931) @@ -87,7 +87,9 @@ typedef enum spa_log_state { enum zio_taskq_type { ZIO_TASKQ_ISSUE = 0, + ZIO_TASKQ_ISSUE_HIGH, ZIO_TASKQ_INTERRUPT, + ZIO_TASKQ_INTERRUPT_HIGH, ZIO_TASKQ_TYPES }; Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h Sat Aug 28 08:57:15 2010 (r211930) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h Sat Aug 28 08:59:55 2010 (r211931) @@ -77,6 +77,7 @@ struct space_map_ops { void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size); void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size); uint64_t (*smop_max)(space_map_t *sm); + boolean_t (*smop_fragmented)(space_map_t *sm); }; /* Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h Sat Aug 28 08:57:15 2010 (r211930) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h Sat Aug 28 08:59:55 2010 (r211931) @@ -107,14 +107,15 @@ enum zio_compress { #define ZIO_PRIORITY_NOW (zio_priority_table[0]) #define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) #define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) -#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[3]) -#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4]) -#define ZIO_PRIORITY_FREE (zio_priority_table[5]) -#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6]) -#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7]) -#define ZIO_PRIORITY_RESILVER (zio_priority_table[8]) -#define ZIO_PRIORITY_SCRUB (zio_priority_table[9]) -#define ZIO_PRIORITY_TABLE_SIZE 10 +#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[3]) +#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[4]) +#define ZIO_PRIORITY_AGG (zio_priority_table[5]) +#define ZIO_PRIORITY_FREE (zio_priority_table[6]) +#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[7]) +#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8]) +#define ZIO_PRIORITY_RESILVER (zio_priority_table[9]) +#define ZIO_PRIORITY_SCRUB (zio_priority_table[10]) +#define ZIO_PRIORITY_TABLE_SIZE 11 #define ZIO_FLAG_MUSTSUCCEED 0x00000 #define ZIO_FLAG_CANFAIL 0x00001 Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c Sat Aug 28 08:57:15 2010 (r211930) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c Sat Aug 28 08:59:55 2010 (r211931) @@ -1773,9 +1773,13 @@ void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; + boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) metaslab_sync_done(msp, txg); + + if (reassess) + metaslab_sync_reassess(vd->vdev_mg); } void Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c Sat Aug 28 08:57:15 2010 (r211930) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c Sat Aug 28 08:59:55 2010 (r211931) @@ -233,7 +233,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, ASSERT(size <= zfs_vdev_aggregation_limit); aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, - zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW, + zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG, flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Sat Aug 28 08:57:15 2010 (r211930) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Sat Aug 28 08:59:55 2010 (r211931) @@ -49,11 +49,12 @@ uint8_t zio_priority_table[ZIO_PRIORITY_ 0, /* ZIO_PRIORITY_NOW */ 0, /* ZIO_PRIORITY_SYNC_READ */ 0, /* ZIO_PRIORITY_SYNC_WRITE */ - 6, /* ZIO_PRIORITY_ASYNC_READ */ - 4, /* ZIO_PRIORITY_ASYNC_WRITE */ - 4, /* ZIO_PRIORITY_FREE */ - 0, /* ZIO_PRIORITY_CACHE_FILL */ 0, /* ZIO_PRIORITY_LOG_WRITE */ + 1, /* ZIO_PRIORITY_CACHE_FILL */ + 1, /* ZIO_PRIORITY_AGG */ + 4, /* ZIO_PRIORITY_FREE */ + 4, /* ZIO_PRIORITY_ASYNC_WRITE */ + 6, /* ZIO_PRIORITY_ASYNC_READ */ 10, /* ZIO_PRIORITY_RESILVER */ 20, /* ZIO_PRIORITY_SCRUB */ }; @@ -64,7 +65,9 @@ uint8_t zio_priority_table[ZIO_PRIORITY_ * ========================================================================== */ char *zio_type_name[ZIO_TYPES] = { - "null", "read", "write", "free", "claim", "ioctl" }; + "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", + "zio_ioctl" +}; #define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */ #define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */ @@ -942,6 +945,7 @@ zio_write_bp_init(zio_t *zio) static void zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q) { + spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; /* @@ -958,7 +962,15 @@ zio_taskq_dispatch(zio_t *zio, enum zio_ if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; - (void) taskq_dispatch_safe(zio->io_spa->spa_zio_taskq[t][q], + /* + * If this is a high priority I/O, then use the high priority taskq. + */ + if (zio->io_priority == ZIO_PRIORITY_NOW && + spa->spa_zio_taskq[t][q + 1] != NULL) + q++; + + ASSERT3U(q, <, ZIO_TASKQ_TYPES); + (void) taskq_dispatch_safe(spa->spa_zio_taskq[t][q], (task_func_t *)zio_execute, zio, &zio->io_task); }