From owner-svn-src-user@FreeBSD.ORG  Wed Mar 18 01:04:18 2015
Return-Path: <owner-svn-src-user@FreeBSD.ORG>
Delivered-To: svn-src-user@freebsd.org
Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1])
 (using TLSv1.2 with cipher AECDH-AES256-SHA (256/256 bits))
 (No client certificate requested)
 by hub.freebsd.org (Postfix) with ESMTPS id 56A41469;
 Wed, 18 Mar 2015 01:04:18 +0000 (UTC)
Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0])
 (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
 (Client did not present a certificate)
 by mx1.freebsd.org (Postfix) with ESMTPS id 40382BC8;
 Wed, 18 Mar 2015 01:04:18 +0000 (UTC)
Received: from svn.freebsd.org ([127.0.1.70])
 by svn.freebsd.org (8.14.9/8.14.9) with ESMTP id t2I14HR7025314;
 Wed, 18 Mar 2015 01:04:17 GMT (envelope-from delphij@FreeBSD.org)
Received: (from delphij@localhost)
 by svn.freebsd.org (8.14.9/8.14.9/Submit) id t2I14HZO025310;
 Wed, 18 Mar 2015 01:04:17 GMT (envelope-from delphij@FreeBSD.org)
Message-Id: <201503180104.t2I14HZO025310@svn.freebsd.org>
X-Authentication-Warning: svn.freebsd.org: delphij set sender to delphij@FreeBSD.org using -f
From: Xin LI <delphij@FreeBSD.org>
Date: Wed, 18 Mar 2015 01:04:17 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-user@freebsd.org
Subject: svn commit: r280198 - in user/delphij/zfs-arc-rebase/sys/cddl:
 compat/opensolaris/sys contrib/opensolaris/uts/common/fs/zfs
X-SVN-Group: user
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
X-BeenThere: svn-src-user@freebsd.org
X-Mailman-Version: 2.1.18-1
Precedence: list
List-Id: "SVN commit messages for the experimental \" user\" src tree"
X-List-Received-Date: Wed, 18 Mar 2015 01:04:18 -0000

Author: delphij
Date: Wed Mar 18 01:04:16 2015
New Revision: 280198
URL: https://svnweb.freebsd.org/changeset/base/280198

Log:
  Initial round of MFV r277425.

  - Ensure that kmem_cache_reap_now() does not result in clearing
    arc_no_grow.
  - Simplify arc_lowmem() a bit and remove arc_lowmem_lock, as
    arc_reclaim_thr_lock already does the required serialization.
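The reworked arc_reclaim_thread() in the arc.c hunks below separates
measuring memory pressure from acting on it: arc_available_memory() returns
a signed headroom figure, the kmem caches are reaped first, and arc_shrink()
is then called only for whatever deficit remains, sized by the new
arc_shrink_shift of 7 (a 1/128 cushion of arc_c) instead of the old
unconditional 1/32 cut. The standalone C sketch below models just that
decision logic; arc_available_memory() is stubbed to return a fixed
shortfall and the sizes are made up, so it illustrates the patch rather
than reproducing kernel code.

#include <stdint.h>
#include <stdio.h>

static int64_t arc_c = 1LL << 30;       /* ARC target size: 1 GB (made up) */
static int arc_shrink_shift = 7;        /* shrink cushion: arc_c >> 7 */
static int arc_no_grow_shift = 5;       /* grow headroom: arc_c >> 5 */

/* Stub standing in for the kernel's arc_available_memory(). */
static int64_t
arc_available_memory(void)
{
    return (-64LL << 20);               /* pretend we are 64 MB short */
}

int
main(void)
{
    int64_t free_memory = arc_available_memory();

    if (free_memory < 0) {
        /*
         * Reap the kmem caches first (arc_kmem_reap_now() in the
         * patch), then re-measure and shrink only by the deficit
         * that remains.
         */
        free_memory = arc_available_memory();
        int64_t to_free = (arc_c >> arc_shrink_shift) - free_memory;
        if (to_free > 0)
            printf("arc_shrink(%jd)\n", (intmax_t)to_free);
    } else if (free_memory < arc_c >> arc_no_grow_shift) {
        printf("hold: arc_no_grow = B_TRUE\n");
    } else {
        printf("grow allowed once the grow delay expires\n");
    }
    return (0);
}

With the stubbed 64 MB shortfall the sketch requests a 72 MB shrink: the
measured deficit plus the arc_c >> arc_shrink_shift cushion.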
Modified:
  user/delphij/zfs-arc-rebase/sys/cddl/compat/opensolaris/sys/kmem.h
  user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
Directory Properties:
  user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/   (props changed)

Modified: user/delphij/zfs-arc-rebase/sys/cddl/compat/opensolaris/sys/kmem.h
==============================================================================
--- user/delphij/zfs-arc-rebase/sys/cddl/compat/opensolaris/sys/kmem.h  Tue Mar 17 22:40:50 2015  (r280197)
+++ user/delphij/zfs-arc-rebase/sys/cddl/compat/opensolaris/sys/kmem.h  Wed Mar 18 01:04:16 2015  (r280198)
@@ -78,6 +78,7 @@ int kmem_debugging(void);
 void *calloc(size_t n, size_t s);
 
 #define freemem     (vm_cnt.v_free_count + vm_cnt.v_cache_count)
+#define desfree     vm_cnt.v_free_target
 #define minfree     vm_cnt.v_free_min
 #define heap_arena  kmem_arena
 #define kmem_alloc(size, kmflags)   zfs_kmem_alloc((size), (kmflags))

Modified: user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c  Tue Mar 17 22:40:50 2015  (r280197)
+++ user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c  Wed Mar 18 01:04:16 2015  (r280198)
@@ -153,13 +153,7 @@ static kmutex_t arc_reclaim_thr_lock;
 static kcondvar_t arc_reclaim_thr_cv;  /* used to signal reclaim thr */
 static uint8_t arc_thread_exit;
 
-#define ARC_REDUCE_DNLC_PERCENT 3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
-
-typedef enum arc_reclaim_strategy {
-    ARC_RECLAIM_AGGR,       /* Aggressive reclaim strategy */
-    ARC_RECLAIM_CONS        /* Conservative reclaim strategy */
-} arc_reclaim_strategy_t;
+uint_t arc_reduce_dnlc_percent = 3;
 
 /*
  * The number of iterations through arc_evict_*() before we
@@ -174,7 +168,19 @@ static int arc_grow_retry = 60;
 static int arc_p_min_shift = 4;
 
 /* log2(fraction of arc to reclaim) */
-static int arc_shrink_shift = 5;
+static int arc_shrink_shift = 7;
+
+/*
+ * log2(fraction of ARC which must be free to allow growing).
+ * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
+ * when reading a new block into the ARC, we will evict an equal-sized block
+ * from the ARC.
+ *
+ * This must be less than arc_shrink_shift, so that when we shrink the ARC,
+ * we will still not allow it to grow.
+ */
+int arc_no_grow_shift = 5;
+
 
 /*
  * minimum lifespan of a prefetch block in clock ticks
@@ -2426,12 +2432,10 @@ arc_flush(spa_t *spa)
 }
 
 void
-arc_shrink(void)
+arc_shrink(int64_t to_free)
 {
     if (arc_c > arc_c_min) {
-        uint64_t to_free;
-
-        to_free = arc_c >> arc_shrink_shift;
         DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
             arc_c_min, uint64_t, arc_p, uint64_t, to_free);
         if (arc_c > arc_c_min + to_free)
@@ -2461,50 +2465,69 @@ arc_shrink(void)
 
 static int needfree = 0;
 
+typedef enum free_memory_reason_t {
+    FMR_UNKNOWN,
+    FMR_NEEDFREE,
+    FMR_LOTSFREE,
+    FMR_SWAPFS_MINFREE,
+    FMR_PAGES_PP_MAXIMUM,
+    FMR_HEAP_ARENA,
+    FMR_ZIO_ARENA,
+} free_memory_reason_t;
+
+int64_t last_free_memory;
+free_memory_reason_t last_free_reason;
+
 /*
- * Determine if the system is under memory pressure and is asking
- * to reclaim memory. A return value of 1 indicates that the system
- * is under memory pressure and that the arc should adjust accordingly.
+ * Additional reserve of pages for pp_reserve.
  */
-static int
-arc_reclaim_needed(void)
-{
-#ifdef illumos
-    uint64_t extra;
-#endif
+int64_t arc_pages_pp_reserve = 64;
+
+/*
+ * Additional reserve of pages for swapfs.
+ */
+int64_t arc_swapfs_reserve = 64;
 
+/*
+ * Return the amount of memory that can be consumed before reclaim will be
+ * needed.  Positive if there is sufficient free memory, negative indicates
+ * the amount of memory that needs to be freed up.
+ */
+static int64_t
+arc_available_memory(void)
+{
+    int64_t lowest = INT64_MAX;
+    int64_t n;
+    free_memory_reason_t r = FMR_UNKNOWN;
+
 #ifdef _KERNEL
-    if (needfree) {
-        DTRACE_PROBE(arc__reclaim_needfree);
-        return (1);
+    if (needfree > 0) {
+        n = PAGESIZE * (-needfree);
+        if (n < lowest) {
+            lowest = n;
+            r = FMR_NEEDFREE;
+        }
     }
 
-    /*
-     * Cooperate with pagedaemon when it's time for it to scan
-     * and reclaim some pages.
-     */
-    if (freemem < zfs_arc_free_target) {
-        DTRACE_PROBE2(arc__reclaim_freemem, uint64_t,
-            freemem, uint64_t, zfs_arc_free_target);
-        return (1);
+    n = PAGESIZE * (freemem - minfree - desfree);
+    if (n < lowest) {
+        lowest = n;
+        r = FMR_LOTSFREE;
    }
 
 #ifdef illumos
     /*
-     * take 'desfree' extra pages, so we reclaim sooner, rather than later
-     */
-    extra = desfree;
-
-    /*
     * check that we're out of range of the pageout scanner.  It starts to
     * schedule paging if freemem is less than lotsfree and needfree.
     * lotsfree is the high-water mark for pageout, and needfree is the
     * number of needed free pages.  We add extra pages here to make sure
     * the scanner doesn't start up while we're freeing memory.
     */
-    if (freemem < lotsfree + needfree + extra)
-        return (1);
+    n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
+    if (n < lowest) {
+        lowest = n;
+        r = FMR_LOTSFREE;
+    }
 
     /*
     * check to make sure that swapfs has enough space so that anon
@@ -2513,8 +2536,12 @@ arc_reclaim_needed(void)
     * swap pages.  We also add a bit of extra here just to prevent
     * circumstances from getting really dire.
     */
-    if (availrmem < swapfs_minfree + swapfs_reserve + extra)
-        return (1);
+    n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
+        desfree - arc_swapfs_reserve);
+    if (n < lowest) {
+        lowest = n;
+        r = FMR_SWAPFS_MINFREE;
+    }
 
     /*
     * Check that we have enough availrmem that memory locking (e.g., via
@@ -2523,10 +2550,14 @@ arc_reclaim_needed(void)
     * drops below pages_pp_maximum, page locking mechanisms such as
     * page_pp_lock() will fail.)
     */
-    if (availrmem <= pages_pp_maximum)
-        return (1);
+    n = PAGESIZE * (availrmem - pages_pp_maximum -
+        arc_pages_pp_reserve);
+    if (n < lowest) {
+        lowest = n;
+        r = FMR_PAGES_PP_MAXIMUM;
+    }
+#endif
 
-#endif  /* illumos */
 #if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
     /*
     * If we're on an i386 platform, it's possible that we'll exhaust the
@@ -2539,12 +2570,14 @@ arc_reclaim_needed(void)
     * heap is allocated.  (Or, in the calculation, if less than 1/4th is
     * free)
     */
-    if (vmem_size(heap_arena, VMEM_FREE) <
-        (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) {
+    n = vmem_size(heap_arena, VMEM_FREE) -
+        (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
+    if (n < lowest) {
+        lowest = n;
+        r = FMR_HEAP_ARENA;
         DTRACE_PROBE2(arc__reclaim_used, uint64_t,
             vmem_size(heap_arena, VMEM_FREE), uint64_t,
             (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2);
-        return (1);
     }
 #endif
 #ifdef illumos
@@ -2557,26 +2590,46 @@ arc_reclaim_needed(void)
     * to aggressively evict memory from the arc in order to avoid
     * memory fragmentation issues.
     */
-    if (zio_arena != NULL &&
-        vmem_size(zio_arena, VMEM_FREE) <
-        (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
-        return (1);
+    if (zio_arena != NULL) {
+        n = vmem_size(zio_arena, VMEM_FREE) -
+            (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
+        if (n < lowest) {
+            lowest = n;
+            r = FMR_ZIO_ARENA;
+        }
+    }
 #endif  /* illumos */
 #else   /* _KERNEL */
+    /* Every 100 calls, free a small amount */
     if (spa_get_random(100) == 0)
-        return (1);
+        lowest = -1024;
 #endif  /* _KERNEL */
 
     DTRACE_PROBE(arc__reclaim_no);
-    return (0);
+
+    last_free_memory = lowest;
+    last_free_reason = r;
+
+    return (lowest);
 }
 
 extern kmem_cache_t *zio_buf_cache[];
 extern kmem_cache_t *zio_data_buf_cache[];
 extern kmem_cache_t *range_seg_cache;
 
+/*
+ * Determine if the system is under memory pressure and is asking
+ * to reclaim memory. A return value of TRUE indicates that the system
+ * is under memory pressure and that the arc should adjust accordingly.
+ */
+static boolean_t
+arc_reclaim_needed(void)
+{
+    return (arc_available_memory() < 0);
+}
+
 static void __noinline
-arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+arc_kmem_reap_now(void)
 {
     size_t          i;
     kmem_cache_t    *prev_cache = NULL;
@@ -2599,13 +2652,6 @@ arc_kmem_reap_now(arc_reclaim_strategy_t
 #endif
 #endif
 
-    /*
-     * An aggressive reclamation will shrink the cache size as well as
-     * reap free buffers from the arc kmem caches.
-     */
-    if (strat == ARC_RECLAIM_AGGR)
-        arc_shrink();
-
     for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
         if (zio_buf_cache[i] != prev_cache) {
             prev_cache = zio_buf_cache[i];
@@ -2621,12 +2667,13 @@ arc_kmem_reap_now(arc_reclaim_strategy_t
     kmem_cache_reap_now(range_seg_cache);
 
 #ifdef illumos
-    /*
-     * Ask the vmem arena to reclaim unused memory from its
-     * quantum caches.
-     */
-    if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
+    if (zio_arena != NULL) {
+        /*
+         * Ask the vmem arena to reclaim unused memory from its
+         * quantum caches.
+         */
         vmem_qcache_reap(zio_arena);
+    }
 #endif
     DTRACE_PROBE(arc__kmem_reap_end);
 }
@@ -2635,46 +2682,44 @@ static void
 arc_reclaim_thread(void *dummy __unused)
 {
     clock_t growtime = 0;
-    arc_reclaim_strategy_t  last_reclaim = ARC_RECLAIM_CONS;
     callb_cpr_t cpr;
 
     CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
 
     mutex_enter(&arc_reclaim_thr_lock);
     while (arc_thread_exit == 0) {
-        if (arc_reclaim_needed()) {
+        int64_t free_memory = arc_available_memory();
+        if (free_memory < 0) {
 
-            if (arc_no_grow) {
-                if (last_reclaim == ARC_RECLAIM_CONS) {
-                    DTRACE_PROBE(arc__reclaim_aggr_no_grow);
-                    last_reclaim = ARC_RECLAIM_AGGR;
-                } else {
-                    last_reclaim = ARC_RECLAIM_CONS;
-                }
-            } else {
-                arc_no_grow = TRUE;
-                last_reclaim = ARC_RECLAIM_AGGR;
-                DTRACE_PROBE(arc__reclaim_aggr);
-                membar_producer();
-            }
+            arc_no_grow = B_TRUE;
+            arc_warm = B_TRUE;
 
-            /* reset the growth delay for every reclaim */
+            /*
+             * Wait at least zfs_grow_retry (default 60) seconds
+             * before considering growing.
+             */
             growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
 
-            if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
-                /*
-                 * If needfree is TRUE our vm_lowmem hook
-                 * was called and in that case we must free some
-                 * memory, so switch to aggressive mode.
-                 */
-                arc_no_grow = TRUE;
-                last_reclaim = ARC_RECLAIM_AGGR;
-            }
-            arc_kmem_reap_now(last_reclaim);
-            arc_warm = B_TRUE;
+            arc_kmem_reap_now();
+
+            /*
+             * If we are still low on memory, shrink the ARC
+             * so that we have arc_shrink_min free space.
+             */
+            free_memory = arc_available_memory();
 
-        } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
-            arc_no_grow = FALSE;
+            int64_t to_free =
+                (arc_c >> arc_shrink_shift) - free_memory;
+            if (to_free > 0) {
+#ifdef _KERNEL
+                to_free = MAX(to_free, ptob(needfree));
+#endif
+                arc_shrink(to_free);
+            }
+        } else if (free_memory < arc_c >> arc_no_grow_shift) {
+            arc_no_grow = B_TRUE;
+        } else if (ddi_get_lbolt() >= growtime) {
+            arc_no_grow = B_FALSE;
         }
 
         arc_adjust();
@@ -4035,7 +4080,6 @@ arc_tempreserve_space(uint64_t reserve,
     return (0);
 }
 
-static kmutex_t arc_lowmem_lock;
 #ifdef _KERNEL
 static eventhandler_tag arc_event_lowmem = NULL;
 
@@ -4043,41 +4087,48 @@ static void
 arc_lowmem(void *arg __unused, int howto __unused)
 {
-    /* Serialize access via arc_lowmem_lock. */
-    mutex_enter(&arc_lowmem_lock);
     mutex_enter(&arc_reclaim_thr_lock);
-    needfree = 1;
+    needfree = 1024;
     DTRACE_PROBE(arc__needfree);
     cv_signal(&arc_reclaim_thr_cv);
 
     /*
     * It is unsafe to block here in arbitrary threads, because we can come
     * here from ARC itself and may hold ARC locks and thus risk a deadlock
     * with ARC reclaim thread.
     */
     if (curproc == pageproc) {
         while (needfree)
             msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
     }
     mutex_exit(&arc_reclaim_thr_lock);
-    mutex_exit(&arc_lowmem_lock);
 }
 #endif
 
 void
 arc_init(void)
 {
-    int i, prefetch_tunable_set = 0;
+    int prefetch_tunable_set = 0;
+
+    /*
+     * allmem is "all memory that we could possibly use".
+     */
+#ifdef illumos
+#ifdef _KERNEL
+    uint64_t allmem = ptob(physmem - swapfs_minfree);
+#else
+    uint64_t allmem = (physmem * PAGESIZE) / 2;
+#endif
+#else
+    uint64_t allmem = kmem_size();
+#endif
 
     mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
     cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
-    mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
 
     /* Convert seconds to clock ticks */
    arc_min_prefetch_lifespan = 1 * hz;
 
     /* Start out with 1/8 of all memory */
-    arc_c = kmem_size() / 8;
+    arc_c = allmem / 8;
 
 #ifdef illumos
 #ifdef _KERNEL
@@ -4089,23 +4140,24 @@ arc_init(void)
     arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
 #endif
 #endif  /* illumos */
+
     /* set min cache to 1/32 of all memory, or 16MB, whichever is more */
-    arc_c_min = MAX(arc_c / 4, 64<<18);
-    /* set max to 1/2 of all memory, or all but 1GB, whichever is more */
-    if (arc_c * 8 >= 1<<30)
-        arc_c_max = (arc_c * 8) - (1<<30);
+    arc_c_min = MAX(allmem / 32, 64 << 18);
+    /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
+    if (allmem >= 1 << 30)
+        arc_c_max = allmem - (1 << 30);
     else
         arc_c_max = arc_c_min;
-    arc_c_max = MAX(arc_c * 5, arc_c_max);
+    arc_c_max = MAX(allmem * 5 / 8, arc_c_max);
 
 #ifdef _KERNEL
     /*
     * Allow the tunables to override our calculations if they are
     * reasonable (ie. over 16MB)
     */
-    if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
+    if (zfs_arc_max > 64 << 18 && zfs_arc_max < allmem)
        arc_c_max = zfs_arc_max;
-    if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
+    if (zfs_arc_min > 64 << 18 && zfs_arc_min <= arc_c_max)
        arc_c_min = zfs_arc_min;
 #endif
 
@@ -4134,6 +4186,12 @@ arc_init(void)
     if (zfs_arc_shrink_shift > 0)
         arc_shrink_shift = zfs_arc_shrink_shift;
 
+    /*
+     * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
+     */
+    if (arc_no_grow_shift >= arc_shrink_shift)
+        arc_no_grow_shift = arc_shrink_shift - 1;
+
     if (zfs_arc_p_min_shift > 0)
         arc_p_min_shift = zfs_arc_p_min_shift;
 
@@ -4303,7 +4361,6 @@ arc_fini(void)
 
     ASSERT(arc_loaned_bytes == 0);
 
-    mutex_destroy(&arc_lowmem_lock);
 #ifdef _KERNEL
     if (arc_event_lowmem != NULL)
         EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
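The arc_available_memory() pattern introduced above — evaluate each
constraint as a signed byte count, keep the minimum, and record the binding
constraint in last_free_reason — is easy to model outside the kernel. The
standalone C sketch below uses made-up page counts in place of the kernel's
freemem, minfree and desfree counters, and covers only two of the patch's
constraints:

#include <stdint.h>
#include <stdio.h>

#define PAGESIZE    4096LL

typedef enum {
    FMR_UNKNOWN,
    FMR_NEEDFREE,
    FMR_LOTSFREE,
} free_memory_reason_t;

/* Made-up page counts standing in for the kernel's VM counters. */
static int64_t needfree = 0;        /* pages requested by the lowmem hook */
static int64_t freemem = 50000;     /* free pages */
static int64_t minfree = 12000;     /* maps to vm_cnt.v_free_min */
static int64_t desfree = 45000;     /* maps to vm_cnt.v_free_target */

int
main(void)
{
    int64_t lowest = INT64_MAX;
    int64_t n;
    free_memory_reason_t r = FMR_UNKNOWN;

    /* An outstanding lowmem request is always treated as a deficit. */
    if (needfree > 0) {
        n = PAGESIZE * (-needfree);
        if (n < lowest) {
            lowest = n;
            r = FMR_NEEDFREE;
        }
    }

    /* Headroom above the pagedaemon's free-page targets. */
    n = PAGESIZE * (freemem - minfree - desfree);
    if (n < lowest) {
        lowest = n;
        r = FMR_LOTSFREE;
    }

    printf("available memory: %jd bytes, binding constraint: %d\n",
        (intmax_t)lowest, (int)r);
    return (0);
}

Because the result is a signed quantity rather than a yes/no answer, the
caller can size arc_shrink() to the actual deficit, which is exactly how the
reworked arc_reclaim_thread() consumes it.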