Date: Tue, 16 Mar 2010 22:17:22 +0000 (UTC) From: Kip Macy <kmacy@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r205231 - in head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys Message-ID: <201003162217.o2GMHMjU012285@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: kmacy Date: Tue Mar 16 22:17:21 2010 New Revision: 205231 URL: http://svn.freebsd.org/changeset/base/205231 Log: - reduce contention by breaking up ARC state locks in to 16 for data and 16 for metadata - export L2ARC tunables as sysctls - add several kstats to track L2ARC state more precisely - avoid holding a contended lock when atomically incrementing a contended counter (no lock protection needed for atomics) Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Tue Mar 16 21:44:21 2010 (r205230) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Tue Mar 16 22:17:21 2010 (r205231) @@ -131,6 +131,7 @@ #include <sys/kstat.h> #include <sys/sdt.h> +#include <sys/ktr.h> #include <vm/vm_pageout.h> static kmutex_t arc_reclaim_thr_lock; @@ -186,6 +187,11 @@ SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_min, SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, &zfs_mdcomp_disable, 0, "Disable metadata compression"); +#ifdef ZIO_USE_UMA +extern kmem_cache_t *zio_buf_cache[]; +extern kmem_cache_t *zio_data_buf_cache[]; +#endif + /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) @@ -218,13 +224,31 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_di * second level ARC benefit from these fast lookups. */ +#define ARCS_LOCK_PAD 128 +struct arcs_lock { + kmutex_t arcs_lock; +#ifdef _KERNEL + unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))]; +#endif +}; + +/* + * must be power of two for mask use to work + * + */ +#define ARC_BUFC_NUMDATALISTS 16 +#define ARC_BUFC_NUMMETADATALISTS 16 +#define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS+ARC_BUFC_NUMDATALISTS) + typedef struct arc_state { - list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ uint64_t arcs_size; /* total amount of data in this state */ - kmutex_t arcs_mtx; + list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */ + struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(128); } arc_state_t; +#define ARCS_LOCK(s, i) &((s)->arcs_locks[(i)].arcs_lock) + /* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; @@ -248,7 +272,9 @@ typedef struct arc_stats { kstat_named_t arcstat_mru_ghost_hits; kstat_named_t arcstat_mfu_hits; kstat_named_t arcstat_mfu_ghost_hits; + kstat_named_t arcstat_allocated; kstat_named_t arcstat_deleted; + kstat_named_t arcstat_stolen; kstat_named_t arcstat_recycle_miss; kstat_named_t arcstat_mutex_miss; kstat_named_t arcstat_evict_skip; @@ -280,6 +306,19 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_size; kstat_named_t arcstat_l2_hdr_size; kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_l2_write_trylock_fail; + kstat_named_t arcstat_l2_write_in_l2; + kstat_named_t arcstat_l2_write_passed_headroom; + kstat_named_t arcstat_l2_write_spa_mismatch; + kstat_named_t arcstat_l2_write_hdr_io_in_progress; + kstat_named_t arcstat_l2_write_not_cacheable; + kstat_named_t arcstat_l2_write_full; + kstat_named_t arcstat_l2_write_buffer_iter; + kstat_named_t arcstat_l2_write_pios; + kstat_named_t arcstat_l2_write_bytes_written; + kstat_named_t arcstat_l2_write_buffer_bytes_scanned; + kstat_named_t arcstat_l2_write_buffer_list_iter; + kstat_named_t arcstat_l2_write_buffer_list_null_iter; } arc_stats_t; static arc_stats_t arc_stats = { @@ -297,7 +336,9 @@ static arc_stats_t arc_stats = { { "mru_ghost_hits", KSTAT_DATA_UINT64 }, { "mfu_hits", KSTAT_DATA_UINT64 }, { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, + { "allocated", KSTAT_DATA_UINT64 }, { "deleted", KSTAT_DATA_UINT64 }, + { "stolen", KSTAT_DATA_UINT64 }, { "recycle_miss", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, @@ -328,7 +369,20 @@ static arc_stats_t arc_stats = { { "l2_io_error", KSTAT_DATA_UINT64 }, { "l2_size", KSTAT_DATA_UINT64 }, { "l2_hdr_size", KSTAT_DATA_UINT64 }, - { "memory_throttle_count", KSTAT_DATA_UINT64 } + { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "l2_write_trylock_fail", KSTAT_DATA_UINT64 }, + { "l2_write_in_l2", KSTAT_DATA_UINT64 }, + { "l2_write_passed_headroom", KSTAT_DATA_UINT64 }, + { "l2_write_spa_mismatch", KSTAT_DATA_UINT64 }, + { "l2_write_io_in_progress", KSTAT_DATA_UINT64 }, + { "l2_write_not_cacheable", KSTAT_DATA_UINT64 }, + { "l2_write_full", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_iter", KSTAT_DATA_UINT64 }, + { "l2_write_pios", KSTAT_DATA_UINT64 }, + { "l2_write_bytes_written", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_bytes_scanned", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 }, + { "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -550,9 +604,10 @@ extern kmem_cache_t *zio_data_buf_cache[ * Level 2 ARC */ -#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ -#define L2ARC_HEADROOM 4 /* num of writes */ +#define L2ARC_WRITE_SIZE (64 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 128 /* num of writes */ #define L2ARC_FEED_SECS 1 /* caching interval */ +#define L2ARC_FEED_SECS_SHIFT 1 /* caching interval shift */ #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -564,7 +619,66 @@ uint64_t l2arc_write_max = L2ARC_WRITE_S uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ -boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +uint64_t l2arc_feed_secs_shift = L2ARC_FEED_SECS_SHIFT; /* interval seconds shift */ +boolean_t l2arc_noprefetch = B_FALSE; /* don't cache prefetch bufs */ + + +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, + &l2arc_write_max, 0, "max write size"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, + &l2arc_write_boost, 0, "extra write during warmup"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, + &l2arc_headroom, 0, "number of dev writes"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, + &l2arc_feed_secs, 0, "interval seconds"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs_shift, CTLFLAG_RW, + &l2arc_feed_secs_shift, 0, "power of 2 division of feed seconds"); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, + &l2arc_noprefetch, 0, "don't cache prefetch bufs"); + + +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, + &ARC_anon.arcs_size, 0, "size of anonymous state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD, + &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD, + &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state"); + +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, + &ARC_mru.arcs_size, 0, "size of mru state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD, + &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD, + &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state"); + +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, + &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, + "size of metadata in mru ghost state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0, + "size of data in mru ghost state"); + +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, + &ARC_mfu.arcs_size, 0, "size of mfu state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD, + &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD, + &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state"); + +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0, + "size of metadata in mfu ghost state"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0, + "size of data in mfu ghost state"); + +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, + &ARC_l2c_only.arcs_size, 0, "size of mru state"); /* * L2ARC Internals @@ -958,20 +1072,42 @@ arc_buf_freeze(arc_buf_t *buf) } static void +get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock) +{ + uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth); + + if (ab->b_type == ARC_BUFC_METADATA) + buf_hashid &= (ARC_BUFC_NUMMETADATALISTS-1); + else { + buf_hashid &= (ARC_BUFC_NUMDATALISTS-1); + buf_hashid += ARC_BUFC_NUMMETADATALISTS; + } + + *list = &state->arcs_lists[buf_hashid]; + *lock = ARCS_LOCK(state, buf_hashid); +} + + +static void add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) { + ASSERT(MUTEX_HELD(hash_lock)); if ((refcount_add(&ab->b_refcnt, tag) == 1) && (ab->b_state != arc_anon)) { + list_t *list; + kmutex_t *lock; uint64_t delta = ab->b_size * ab->b_datacnt; - list_t *list = &ab->b_state->arcs_list[ab->b_type]; uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; - ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); - mutex_enter(&ab->b_state->arcs_mtx); + get_buf_info(ab, ab->b_state, &list, &lock); + ASSERT(!MUTEX_HELD(lock)); + mutex_enter(lock); ASSERT(list_link_active(&ab->b_arc_node)); list_remove(list, ab); + mutex_exit(lock); + if (GHOST_STATE(ab->b_state)) { ASSERT3U(ab->b_datacnt, ==, 0); ASSERT3P(ab->b_buf, ==, NULL); @@ -980,7 +1116,6 @@ add_reference(arc_buf_hdr_t *ab, kmutex_ ASSERT(delta > 0); ASSERT3U(*size, >=, delta); atomic_add_64(size, -delta); - mutex_exit(&ab->b_state->arcs_mtx); /* remove the prefetch flag if we get a reference */ if (ab->b_flags & ARC_PREFETCH) ab->b_flags &= ~ARC_PREFETCH; @@ -999,14 +1134,19 @@ remove_reference(arc_buf_hdr_t *ab, kmut if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && (state != arc_anon)) { uint64_t *size = &state->arcs_lsize[ab->b_type]; + list_t *list; + kmutex_t *lock; + + get_buf_info(ab, state, &list, &lock); - ASSERT(!MUTEX_HELD(&state->arcs_mtx)); - mutex_enter(&state->arcs_mtx); + ASSERT(!MUTEX_HELD(lock)); + mutex_enter(lock); ASSERT(!list_link_active(&ab->b_arc_node)); - list_insert_head(&state->arcs_list[ab->b_type], ab); + list_insert_head(list, ab); + mutex_exit(lock); + ASSERT(ab->b_datacnt > 0); atomic_add_64(size, ab->b_size * ab->b_datacnt); - mutex_exit(&state->arcs_mtx); } return (cnt); } @@ -1021,6 +1161,8 @@ arc_change_state(arc_state_t *new_state, arc_state_t *old_state = ab->b_state; int64_t refcnt = refcount_count(&ab->b_refcnt); uint64_t from_delta, to_delta; + list_t *list; + kmutex_t *lock; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(new_state != old_state); @@ -1035,14 +1177,17 @@ arc_change_state(arc_state_t *new_state, */ if (refcnt == 0) { if (old_state != arc_anon) { - int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); + int use_mutex; uint64_t *size = &old_state->arcs_lsize[ab->b_type]; + get_buf_info(ab, old_state, &list, &lock); + use_mutex = !MUTEX_HELD(lock); + if (use_mutex) - mutex_enter(&old_state->arcs_mtx); + mutex_enter(lock); ASSERT(list_link_active(&ab->b_arc_node)); - list_remove(&old_state->arcs_list[ab->b_type], ab); + list_remove(list, ab); /* * If prefetching out of the ghost cache, @@ -1057,16 +1202,20 @@ arc_change_state(arc_state_t *new_state, atomic_add_64(size, -from_delta); if (use_mutex) - mutex_exit(&old_state->arcs_mtx); + mutex_exit(lock); } if (new_state != arc_anon) { - int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); + int use_mutex; uint64_t *size = &new_state->arcs_lsize[ab->b_type]; + get_buf_info(ab, new_state, &list, &lock); + use_mutex = !MUTEX_HELD(lock); + + if (use_mutex) - mutex_enter(&new_state->arcs_mtx); + mutex_enter(lock); - list_insert_head(&new_state->arcs_list[ab->b_type], ab); + list_insert_head(list, ab); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { @@ -1077,7 +1226,7 @@ arc_change_state(arc_state_t *new_state, atomic_add_64(size, to_delta); if (use_mutex) - mutex_exit(&new_state->arcs_mtx); + mutex_exit(lock); } } @@ -1467,21 +1616,49 @@ arc_evict(arc_state_t *state, spa_t *spa { arc_state_t *evicted_state; uint64_t bytes_evicted = 0, skipped = 0, missed = 0; + int64_t bytes_remaining; arc_buf_hdr_t *ab, *ab_prev = NULL; - list_t *list = &state->arcs_list[type]; + list_t *evicted_list, *list, *evicted_list_start, *list_start; + kmutex_t *lock, *evicted_lock; kmutex_t *hash_lock; boolean_t have_lock; void *stolen = NULL; + static int evict_metadata_offset, evict_data_offset; + int i, idx, offset, list_count, count; ASSERT(state == arc_mru || state == arc_mfu); evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; + + if (type == ARC_BUFC_METADATA) { + offset = 0; + list_count = ARC_BUFC_NUMMETADATALISTS; + list_start = &state->arcs_lists[0]; + evicted_list_start = &evicted_state->arcs_lists[0]; + idx = evict_metadata_offset; + } else { + offset = ARC_BUFC_NUMMETADATALISTS; + + list_start = &state->arcs_lists[offset]; + evicted_list_start = &evicted_state->arcs_lists[offset]; + list_count = ARC_BUFC_NUMDATALISTS; + idx = evict_data_offset; + } + bytes_remaining = evicted_state->arcs_lsize[type]; + count = 0; + +evict_start: + list = &list_start[idx]; + evicted_list = &evicted_list_start[idx]; + lock = ARCS_LOCK(state, (offset + idx)); + evicted_lock = ARCS_LOCK(evicted_state, (offset + idx)); - mutex_enter(&state->arcs_mtx); - mutex_enter(&evicted_state->arcs_mtx); + mutex_enter(lock); + mutex_enter(evicted_lock); for (ab = list_tail(list); ab; ab = ab_prev) { ab_prev = list_prev(list, ab); + bytes_remaining -= (ab->b_size * ab->b_datacnt); /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(ab) || (spa && ab->b_spa != spa) || @@ -1541,18 +1718,36 @@ arc_evict(arc_state_t *state, spa_t *spa mutex_exit(hash_lock); if (bytes >= 0 && bytes_evicted >= bytes) break; + if (bytes_remaining > 0) { + mutex_exit(evicted_lock); + mutex_exit(lock); + idx = ((idx + 1)&(list_count-1)); + count++; + goto evict_start; + } } else { missed += 1; } } - mutex_exit(&evicted_state->arcs_mtx); - mutex_exit(&state->arcs_mtx); - - if (bytes_evicted < bytes) - dprintf("only evicted %lld bytes from %x", - (longlong_t)bytes_evicted, state); + mutex_exit(evicted_lock); + mutex_exit(lock); + + idx = ((idx + 1)&(list_count-1)); + count++; + if (bytes_evicted < bytes) { + if (count < list_count) + goto evict_start; + else + dprintf("only evicted %lld bytes from %x", + (longlong_t)bytes_evicted, state); + } + if (type == ARC_BUFC_METADATA) + evict_metadata_offset = idx; + else + evict_data_offset = idx; + if (skipped) ARCSTAT_INCR(arcstat_evict_skip, skipped); @@ -1579,6 +1774,8 @@ arc_evict(arc_state_t *state, spa_t *spa arc_evict_ghost(arc_mfu_ghost, NULL, todelete); } } + if (stolen) + ARCSTAT_BUMP(arcstat_stolen); return (stolen); } @@ -1591,14 +1788,28 @@ static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; - list_t *list = &state->arcs_list[ARC_BUFC_DATA]; - kmutex_t *hash_lock; + list_t *list, *list_start; + kmutex_t *hash_lock, *lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; + static int evict_offset; + int list_count, idx = evict_offset; + int offset, count = 0; ASSERT(GHOST_STATE(state)); -top: - mutex_enter(&state->arcs_mtx); + + /* + * data lists come after metadata lists + */ + list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS]; + list_count = ARC_BUFC_NUMDATALISTS; + offset = ARC_BUFC_NUMMETADATALISTS; + +evict_start: + list = &list_start[idx]; + lock = ARCS_LOCK(state, idx + offset); + + mutex_enter(lock); for (ab = list_tail(list); ab; ab = ab_prev) { ab_prev = list_prev(list, ab); if (spa && ab->b_spa != spa) @@ -1628,20 +1839,31 @@ top: break; } else { if (bytes < 0) { - mutex_exit(&state->arcs_mtx); + /* + * we're draining the ARC, retry + */ + mutex_exit(lock); mutex_enter(hash_lock); mutex_exit(hash_lock); - goto top; + goto evict_start; } bufs_skipped += 1; } } - mutex_exit(&state->arcs_mtx); - - if (list == &state->arcs_list[ARC_BUFC_DATA] && + mutex_exit(lock); + idx = ((idx + 1)&(ARC_BUFC_NUMDATALISTS-1)); + count++; + + if (count < list_count) + goto evict_start; + + evict_offset = idx; + if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] && (bytes < 0 || bytes_deleted < bytes)) { - list = &state->arcs_list[ARC_BUFC_METADATA]; - goto top; + list_start = &state->arcs_lists[0]; + list_count = ARC_BUFC_NUMMETADATALISTS; + offset = count = 0; + goto evict_start; } if (bufs_skipped) { @@ -1755,22 +1977,22 @@ restart: void arc_flush(spa_t *spa) { - while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { + while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) { (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } - while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { + while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) { (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } - while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { + while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) { (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } - while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { + while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) { (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; @@ -2206,6 +2428,7 @@ out: arc_anon->arcs_size + arc_mru->arcs_size > arc_p) arc_p = MIN(arc_c, arc_p + size); } + ARCSTAT_BUMP(arcstat_allocated); } /* @@ -2391,7 +2614,10 @@ arc_read_done(zio_t *zio) hdr->b_flags &= ~ARC_L2_EVICTED; if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) hdr->b_flags &= ~ARC_L2CACHE; - +#if 0 + else if ((hdr->b_flags & ARC_PREFETCH) == 0) + hdr->b_flags |= ARC_L2CACHE; +#endif /* byteswap if necessary */ callback_list = hdr->b_acb; ASSERT(callback_list != NULL); @@ -2505,6 +2731,7 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_ uint32_t *arc_flags, const zbookmark_t *zb) { int err; + arc_buf_hdr_t *hdr = pbuf->b_hdr; ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); @@ -2513,8 +2740,8 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_ err = arc_read_nolock(pio, spa, bp, done, private, priority, zio_flags, arc_flags, zb); + ASSERT3P(hdr, ==, pbuf->b_hdr); rw_exit(&pbuf->b_lock); - return (err); } @@ -2825,7 +3052,9 @@ arc_buf_evict(arc_buf_t *buf) arc_buf_hdr_t *hdr; kmutex_t *hash_lock; arc_buf_t **bufp; - + list_t *list, *evicted_list; + kmutex_t *lock, *evicted_lock; + rw_enter(&buf->b_lock, RW_WRITER); hdr = buf->b_hdr; if (hdr == NULL) { @@ -2873,16 +3102,18 @@ arc_buf_evict(arc_buf_t *buf) evicted_state = (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; - mutex_enter(&old_state->arcs_mtx); - mutex_enter(&evicted_state->arcs_mtx); + get_buf_info(hdr, old_state, &list, &lock); + get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock); + mutex_enter(lock); + mutex_enter(evicted_lock); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); hdr->b_flags |= ARC_IN_HASH_TABLE; hdr->b_flags &= ~ARC_BUF_AVAILABLE; - mutex_exit(&evicted_state->arcs_mtx); - mutex_exit(&old_state->arcs_mtx); + mutex_exit(evicted_lock); + mutex_exit(lock); } mutex_exit(hash_lock); rw_exit(&buf->b_lock); @@ -3428,7 +3659,8 @@ void arc_init(void) { int prefetch_tunable_set = 0; - + int i; + mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL); @@ -3496,33 +3728,34 @@ arc_init(void) arc_l2c_only = &ARC_l2c_only; arc_size = 0; - mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); - - list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); - list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], - sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { + + mutex_init(&arc_anon->arcs_locks[i].arcs_lock, + NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mru->arcs_locks[i].arcs_lock, + NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock, + NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mfu->arcs_locks[i].arcs_lock, + NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock, + NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock, + NULL, MUTEX_DEFAULT, NULL); + + list_create(&arc_mru->arcs_lists[i], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mru_ghost->arcs_lists[i], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu->arcs_lists[i], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_lists[i], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_mfu_ghost->arcs_lists[i], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_lists[i], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + } buf_init(); @@ -3596,7 +3829,8 @@ arc_init(void) void arc_fini(void) { - + int i; + mutex_enter(&arc_reclaim_thr_lock); arc_thread_exit = 1; cv_signal(&arc_reclaim_thr_cv); @@ -3617,21 +3851,19 @@ arc_fini(void) mutex_destroy(&arc_reclaim_thr_lock); cv_destroy(&arc_reclaim_thr_cv); - list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); - list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); - list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); - - mutex_destroy(&arc_anon->arcs_mtx); - mutex_destroy(&arc_mru->arcs_mtx); - mutex_destroy(&arc_mru_ghost->arcs_mtx); - mutex_destroy(&arc_mfu->arcs_mtx); - mutex_destroy(&arc_mfu_ghost->arcs_mtx); - + for (i = 0; i < ARC_BUFC_NUMLISTS; i++) { + list_destroy(&arc_mru->arcs_lists[i]); + list_destroy(&arc_mru_ghost->arcs_lists[i]); + list_destroy(&arc_mfu->arcs_lists[i]); + list_destroy(&arc_mfu_ghost->arcs_lists[i]); + + mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock); + mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock); + mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock); + mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock); + mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock); + } + mutex_destroy(&zfs_write_limit_lock); buf_fini(); @@ -4026,28 +4258,31 @@ static list_t * l2arc_list_locked(int list_num, kmutex_t **lock) { list_t *list; + int idx; + + ASSERT(list_num >= 0 && list_num < 2*ARC_BUFC_NUMLISTS); - ASSERT(list_num >= 0 && list_num <= 3); - - switch (list_num) { - case 0: - list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; - *lock = &arc_mfu->arcs_mtx; - break; - case 1: - list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; - *lock = &arc_mru->arcs_mtx; - break; - case 2: - list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; - *lock = &arc_mfu->arcs_mtx; - break; - case 3: - list = &arc_mru->arcs_list[ARC_BUFC_DATA]; - *lock = &arc_mru->arcs_mtx; - break; + if (list_num < ARC_BUFC_NUMMETADATALISTS) { + idx = list_num; + list = &arc_mfu->arcs_lists[idx]; + *lock = ARCS_LOCK(arc_mfu, idx); + } else if (list_num < ARC_BUFC_NUMMETADATALISTS*2) { + idx = list_num - ARC_BUFC_NUMMETADATALISTS; + list = &arc_mru->arcs_lists[idx]; + *lock = ARCS_LOCK(arc_mru, idx); + } else if (list_num < (ARC_BUFC_NUMMETADATALISTS*2 + + ARC_BUFC_NUMDATALISTS)) { + idx = list_num - ARC_BUFC_NUMMETADATALISTS; + list = &arc_mfu->arcs_lists[idx]; + *lock = ARCS_LOCK(arc_mfu, idx); + } else { + idx = list_num - ARC_BUFC_NUMLISTS; + list = &arc_mru->arcs_lists[idx]; + *lock = ARCS_LOCK(arc_mru, idx); } + CTR3(KTR_SPARE2, "list=%p list_num=%d idx=%d", + list, list_num, idx); ASSERT(!(MUTEX_HELD(*lock))); mutex_enter(*lock); return (list); @@ -4212,13 +4447,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_de head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); head->b_flags |= ARC_L2_WRITE_HEAD; + ARCSTAT_BUMP(arcstat_l2_write_buffer_iter); /* * Copy buffers for L2ARC writing. */ mutex_enter(&l2arc_buflist_mtx); - for (try = 0; try <= 3; try++) { + for (try = 0; try < 2*ARC_BUFC_NUMLISTS; try++) { list = l2arc_list_locked(try, &list_lock); passed_sz = 0; + ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter); /* * L2ARC fast warmup. @@ -4231,52 +4468,66 @@ l2arc_write_buffers(spa_t *spa, l2arc_de ab = list_head(list); else ab = list_tail(list); + if (ab == NULL) { + ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter); + } for (; ab; ab = ab_prev) { if (arc_warm == B_FALSE) ab_prev = list_next(list, ab); else ab_prev = list_prev(list, ab); - + ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size); + hash_lock = HDR_LOCK(ab); have_lock = MUTEX_HELD(hash_lock); if (!have_lock && !mutex_tryenter(hash_lock)) { + ARCSTAT_BUMP(arcstat_l2_write_trylock_fail); /* * Skip this buffer rather than waiting. */ continue; } + if (ab->b_l2hdr != NULL) { + /* + * Already in L2ARC. + */ + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_l2_write_in_l2); + continue; + } + passed_sz += ab->b_size; if (passed_sz > headroom) { /* * Searched too far. */ mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_l2_write_passed_headroom); break; } if (ab->b_spa != spa) { mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch); continue; } - if (ab->b_l2hdr != NULL) { - /* - * Already in L2ARC. - */ + if (HDR_IO_IN_PROGRESS(ab)) { mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress); continue; } - - if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) { + if (!HDR_L2CACHE(ab)) { mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_l2_write_not_cacheable); continue; } - if ((write_sz + ab->b_size) > target_sz) { full = B_TRUE; mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_l2_write_full); break; } @@ -4300,8 +4551,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_de cb->l2wcb_head = head; pio = zio_root(spa, l2arc_write_done, cb, ZIO_FLAG_CANFAIL); + ARCSTAT_BUMP(arcstat_l2_write_pios); } + ARCSTAT_INCR(arcstat_l2_write_bytes_written, ab->b_size); /* * Create and add a new L2ARC header. */ @@ -4309,7 +4562,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_de hdrl2->b_dev = dev; hdrl2->b_daddr = dev->l2ad_hand; - ab->b_flags |= ARC_L2_WRITING; ab->b_l2hdr = hdrl2; list_insert_head(dev->l2ad_buflist, ab); buf_data = ab->b_buf->b_data; @@ -4397,7 +4649,7 @@ l2arc_feed_thread(void *dummy __unused) */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, - hz * l2arc_feed_secs); + hz * l2arc_feed_secs >> l2arc_feed_secs_shift); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); /* Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h ============================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h Tue Mar 16 21:44:21 2010 (r205230) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h Tue Mar 16 22:17:21 2010 (r205231) @@ -55,8 +55,8 @@ struct arc_buf { }; typedef enum arc_buf_contents { - ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_METADATA, /* buffer contains metadata */ + ARC_BUFC_DATA, /* buffer contains data */ ARC_BUFC_NUMTYPES } arc_buf_contents_t; /*
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201003162217.o2GMHMjU012285>