Skip site navigation (1)Skip section navigation (2)
Date:      Fri, 3 Oct 2014 20:34:56 +0000 (UTC)
From:      Steven Hartland <smh@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r272483 - in head/sys: cddl/compat/opensolaris/kern cddl/compat/opensolaris/sys cddl/contrib/opensolaris/uts/common/fs/zfs vm
Message-ID:  <201410032034.s93KYuUD075578@svn.freebsd.org>

next in thread | raw e-mail | index | archive | help
Author: smh
Date: Fri Oct  3 20:34:55 2014
New Revision: 272483
URL: https://svnweb.freebsd.org/changeset/base/272483

Log:
  Refactor ZFS ARC reclaim checks and limits
  
  Remove previously added kmem methods in favour of defines which
  allow diff minimisation between upstream code base.
  
  Rebalance ARC free target to be vm_pageout_wakeup_thresh by default
  which eliminates issue where ARC gets minimised instead of balancing
  with VM pageout. The restores the target point prior to r270759.
  
  Bring in missing upstream only changes which move unused code to
  further eliminate code differences.
  
  Add additional DTRACE probe to aid monitoring of ARC behaviour.
  
  Enable upstream i386 code paths on platforms which don't define
  UMA_MD_SMALL_ALLOC.
  
  Fix mixture of byte an page values in arc_memory_throttle i386 code
  path value assignment of available_memory.
  
  PR:		187594
  Review:		D702
  Reviewed by:	avg
  MFC after:	1 week
  X-MFC-With:	r270759 & r270861
  Sponsored by:	Multiplay

Modified:
  head/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
  head/sys/cddl/compat/opensolaris/sys/kmem.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
  head/sys/vm/vm_pageout.c

Modified: head/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
==============================================================================
--- head/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c	Fri Oct  3 20:24:56 2014	(r272482)
+++ head/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c	Fri Oct  3 20:34:55 2014	(r272483)
@@ -126,42 +126,6 @@ kmem_size_init(void *unused __unused)
 }
 SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL);
 
-/*
- * The return values from kmem_free_* are only valid once the pagedaemon
- * has been initialised, before then they return 0.
- * 
- * To ensure the returns are valid the caller can use a SYSINIT with
- * subsystem set to SI_SUB_KTHREAD_PAGE and an order of at least
- * SI_ORDER_SECOND.
- */
-u_int
-kmem_free_target(void)
-{
-
-	return (vm_cnt.v_free_target);
-}
-
-u_int
-kmem_free_min(void)
-{
-
-	return (vm_cnt.v_free_min);
-}
-
-u_int
-kmem_free_count(void)
-{
-
-	return (vm_cnt.v_free_count + vm_cnt.v_cache_count);
-}
-
-u_int
-kmem_page_count(void)
-{
-
-	return (vm_cnt.v_page_count);
-}
-
 uint64_t
 kmem_size(void)
 {
@@ -169,13 +133,6 @@ kmem_size(void)
 	return (kmem_size_val);
 }
 
-uint64_t
-kmem_used(void)
-{
-
-	return (vmem_size(kmem_arena, VMEM_ALLOC));
-}
-
 static int
 kmem_std_constructor(void *mem, int size __unused, void *private, int flags)
 {

Modified: head/sys/cddl/compat/opensolaris/sys/kmem.h
==============================================================================
--- head/sys/cddl/compat/opensolaris/sys/kmem.h	Fri Oct  3 20:24:56 2014	(r272482)
+++ head/sys/cddl/compat/opensolaris/sys/kmem.h	Fri Oct  3 20:34:55 2014	(r272483)
@@ -66,17 +66,6 @@ typedef struct kmem_cache {
 void *zfs_kmem_alloc(size_t size, int kmflags);
 void zfs_kmem_free(void *buf, size_t size);
 uint64_t kmem_size(void);
-uint64_t kmem_used(void);
-u_int kmem_page_count(void);
-
-/*
- * The return values from kmem_free_* are only valid once the pagedaemon
- * has been initialised, before then they return 0.
- */
-u_int kmem_free_count(void);
-u_int kmem_free_target(void);
-u_int kmem_free_min(void);
-
 kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align,
     int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
     void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags);
@@ -88,6 +77,9 @@ void kmem_reap(void);
 int kmem_debugging(void);
 void *calloc(size_t n, size_t s);
 
+#define	freemem				(vm_cnt.v_free_count + vm_cnt.v_cache_count)
+#define	minfree				vm_cnt.v_free_min
+#define	heap_arena			kmem_arena
 #define	kmem_alloc(size, kmflags)	zfs_kmem_alloc((size), (kmflags))
 #define	kmem_zalloc(size, kmflags)	zfs_kmem_alloc((size), (kmflags) | M_ZERO)
 #define	kmem_free(buf, size)		zfs_kmem_free((buf), (size))

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Fri Oct  3 20:24:56 2014	(r272482)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Fri Oct  3 20:34:55 2014	(r272483)
@@ -138,6 +138,7 @@
 #include <sys/sdt.h>
 
 #include <vm/vm_pageout.h>
+#include <machine/vmparam.h>
 
 #ifdef illumos
 #ifndef _KERNEL
@@ -201,7 +202,7 @@ int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
 int zfs_disable_dup_eviction = 0;
 uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
-u_int zfs_arc_free_target = (1 << 19); /* default before pagedaemon init only */
+u_int zfs_arc_free_target = 0;
 
 static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
 
@@ -210,11 +211,10 @@ static void
 arc_free_target_init(void *unused __unused)
 {
 
-	zfs_arc_free_target = kmem_free_target();
+	zfs_arc_free_target = vm_pageout_wakeup_thresh;
 }
 SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
     arc_free_target_init, NULL);
-#endif
 
 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
 SYSCTL_DECL(_vfs_zfs);
@@ -245,15 +245,16 @@ sysctl_vfs_zfs_arc_free_target(SYSCTL_HA
 	if (err != 0 || req->newptr == NULL)
 		return (err);
 
-	if (val < kmem_free_min())
+	if (val < minfree)
 		return (EINVAL);
-	if (val > kmem_page_count())
+	if (val > vm_cnt.v_page_count)
 		return (EINVAL);
 
 	zfs_arc_free_target = val;
 
 	return (0);
 }
+#endif
 
 /*
  * Note that buffers can be in one of 6 states:
@@ -2445,8 +2446,8 @@ arc_shrink(void)
 	if (arc_c > arc_c_min) {
 		uint64_t to_free;
 
-		DTRACE_PROBE2(arc__shrink, uint64_t, arc_c, uint64_t,
-			arc_c_min);
+		DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
+			arc_c_min, uint64_t, arc_p, uint64_t, to_free);
 #ifdef _KERNEL
 		to_free = arc_c >> arc_shrink_shift;
 #else
@@ -2462,6 +2463,10 @@ arc_shrink(void)
 			arc_c = MAX(arc_size, arc_c_min);
 		if (arc_p > arc_c)
 			arc_p = (arc_c >> 1);
+
+		DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
+			arc_p);
+
 		ASSERT(arc_c >= arc_c_min);
 		ASSERT((int64_t)arc_p >= 0);
 	}
@@ -2486,18 +2491,13 @@ arc_reclaim_needed(void)
 		return (1);
 	}
 
-	if (kmem_free_count() < zfs_arc_free_target) {
-		DTRACE_PROBE2(arc__reclaim_freetarget, uint64_t,
-		    kmem_free_count(), uint64_t, zfs_arc_free_target);
-		return (1);
-	}
-
 	/*
 	 * Cooperate with pagedaemon when it's time for it to scan
 	 * and reclaim some pages.
 	 */
-	if (vm_paging_needed()) {
-		DTRACE_PROBE(arc__reclaim_paging);
+	if (freemem < zfs_arc_free_target) {
+		DTRACE_PROBE2(arc__reclaim_freemem, uint64_t,
+		    freemem, uint64_t, zfs_arc_free_target);
 		return (1);
 	}
 
@@ -2527,7 +2527,18 @@ arc_reclaim_needed(void)
 	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
 		return (1);
 
-#if defined(__i386)
+	/*
+	 * Check that we have enough availrmem that memory locking (e.g., via
+	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
+	 * stores the number of pages that cannot be locked; when availrmem
+	 * drops below pages_pp_maximum, page locking mechanisms such as
+	 * page_pp_lock() will fail.)
+	 */
+	if (availrmem <= pages_pp_maximum)
+		return (1);
+
+#endif	/* sun */
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
 	/*
 	 * If we're on an i386 platform, it's possible that we'll exhaust the
 	 * kernel heap space before we ever run out of available physical
@@ -2539,25 +2550,33 @@ arc_reclaim_needed(void)
 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
 	 * free)
 	 */
-	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
-	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
-		return (1);
-#endif
-#else	/* sun */
-#ifdef __i386__
-	/* i386 has KVA limits that the raw page counts above don't consider */
-	if (kmem_used() > (kmem_size() * 3) / 4) {
+	if (vmem_size(heap_arena, VMEM_FREE) <
+	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2)) {
 		DTRACE_PROBE2(arc__reclaim_used, uint64_t,
-		    kmem_used(), uint64_t, (kmem_size() * 3) / 4);
+		    vmem_size(heap_arena, VMEM_FREE), uint64_t,
+		    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2);
 		return (1);
 	}
 #endif
+#ifdef sun
+	/*
+	 * If zio data pages are being allocated out of a separate heap segment,
+	 * then enforce that the size of available vmem for this arena remains
+	 * above about 1/16th free.
+	 *
+	 * Note: The 1/16th arena free requirement was put in place
+	 * to aggressively evict memory from the arc in order to avoid
+	 * memory fragmentation issues.
+	 */
+	if (zio_arena != NULL &&
+	    vmem_size(zio_arena, VMEM_FREE) <
+	    (vmem_size(zio_arena, VMEM_ALLOC) >> 4))
+		return (1);
 #endif	/* sun */
-
-#else
+#else	/* _KERNEL */
 	if (spa_get_random(100) == 0)
 		return (1);
-#endif
+#endif	/* _KERNEL */
 	DTRACE_PROBE(arc__reclaim_no);
 
 	return (0);
@@ -2566,13 +2585,14 @@ arc_reclaim_needed(void)
 extern kmem_cache_t	*zio_buf_cache[];
 extern kmem_cache_t	*zio_data_buf_cache[];
 
-static void
+static void __noinline
 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 {
 	size_t			i;
 	kmem_cache_t		*prev_cache = NULL;
 	kmem_cache_t		*prev_data_cache = NULL;
 
+	DTRACE_PROBE(arc__kmem_reap_start);
 #ifdef _KERNEL
 	if (arc_meta_used >= arc_meta_limit) {
 		/*
@@ -2608,6 +2628,16 @@ arc_kmem_reap_now(arc_reclaim_strategy_t
 	}
 	kmem_cache_reap_now(buf_cache);
 	kmem_cache_reap_now(hdr_cache);
+
+#ifdef sun
+	/*
+	 * Ask the vmem arena to reclaim unused memory from its
+	 * quantum caches.
+	 */
+	if (zio_arena != NULL && strat == ARC_RECLAIM_AGGR)
+		vmem_qcache_reap(zio_arena);
+#endif
+	DTRACE_PROBE(arc__kmem_reap_end);
 }
 
 static void
@@ -2625,6 +2655,7 @@ arc_reclaim_thread(void *dummy __unused)
 
 			if (arc_no_grow) {
 				if (last_reclaim == ARC_RECLAIM_CONS) {
+					DTRACE_PROBE(arc__reclaim_aggr_no_grow);
 					last_reclaim = ARC_RECLAIM_AGGR;
 				} else {
 					last_reclaim = ARC_RECLAIM_CONS;
@@ -2632,6 +2663,7 @@ arc_reclaim_thread(void *dummy __unused)
 			} else {
 				arc_no_grow = TRUE;
 				last_reclaim = ARC_RECLAIM_AGGR;
+				DTRACE_PROBE(arc__reclaim_aggr);
 				membar_producer();
 			}
 
@@ -2736,6 +2768,7 @@ arc_adapt(int bytes, arc_state_t *state)
 	 * cache size, increment the target cache size
 	 */
 	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+		DTRACE_PROBE1(arc__inc_adapt, int, bytes);
 		atomic_add_64(&arc_c, (int64_t)bytes);
 		if (arc_c > arc_c_max)
 			arc_c = arc_c_max;
@@ -2757,20 +2790,6 @@ arc_evict_needed(arc_buf_contents_t type
 	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
 		return (1);
 
-#ifdef sun
-#ifdef _KERNEL
-	/*
-	 * If zio data pages are being allocated out of a separate heap segment,
-	 * then enforce that the size of available vmem for this area remains
-	 * above about 1/32nd free.
-	 */
-	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
-	    vmem_size(zio_arena, VMEM_FREE) <
-	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
-		return (1);
-#endif
-#endif	/* sun */
-
 	if (arc_reclaim_needed())
 		return (1);
 
@@ -3929,20 +3948,16 @@ static int
 arc_memory_throttle(uint64_t reserve, uint64_t txg)
 {
 #ifdef _KERNEL
-	uint64_t available_memory =
-	    ptoa((uintmax_t)vm_cnt.v_free_count + vm_cnt.v_cache_count);
+	uint64_t available_memory = ptob(freemem);
 	static uint64_t page_load = 0;
 	static uint64_t last_txg = 0;
 
-#ifdef sun
-#if defined(__i386)
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
 	available_memory =
-	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
+	    MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
 #endif
-#endif	/* sun */
 
-	if (vm_cnt.v_free_count + vm_cnt.v_cache_count >
-	    (uint64_t)physmem * arc_lotsfree_percent / 100)
+	if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
 		return (0);
 
 	if (txg > last_txg) {
@@ -3955,7 +3970,7 @@ arc_memory_throttle(uint64_t reserve, ui
 	 * continue to let page writes occur as quickly as possible.
 	 */
 	if (curproc == pageproc) {
-		if (page_load > available_memory / 4)
+		if (page_load > MAX(ptob(minfree), available_memory) / 4)
 			return (SET_ERROR(ERESTART));
 		/* Note: reserve is inflated, so we deflate */
 		page_load += reserve / 8;
@@ -3983,8 +3998,10 @@ arc_tempreserve_space(uint64_t reserve, 
 	int error;
 	uint64_t anon_size;
 
-	if (reserve > arc_c/4 && !arc_no_grow)
+	if (reserve > arc_c/4 && !arc_no_grow) {
 		arc_c = MIN(arc_c_max, reserve * 4);
+		DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
+	}
 	if (reserve > arc_c)
 		return (SET_ERROR(ENOMEM));
 
@@ -4038,6 +4055,7 @@ arc_lowmem(void *arg __unused, int howto
 	mutex_enter(&arc_lowmem_lock);
 	mutex_enter(&arc_reclaim_thr_lock);
 	needfree = 1;
+	DTRACE_PROBE(arc__needfree);
 	cv_signal(&arc_reclaim_thr_cv);
 
 	/*

Modified: head/sys/vm/vm_pageout.c
==============================================================================
--- head/sys/vm/vm_pageout.c	Fri Oct  3 20:24:56 2014	(r272482)
+++ head/sys/vm/vm_pageout.c	Fri Oct  3 20:34:55 2014	(r272483)
@@ -76,6 +76,7 @@
 __FBSDID("$FreeBSD$");
 
 #include "opt_vm.h"
+#include "opt_kdtrace.h"
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -89,6 +90,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/racct.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
+#include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/smp.h>
 #include <sys/vnode.h>
@@ -133,6 +135,10 @@ static struct kproc_desc page_kp = {
 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,
     &page_kp);
 
+SDT_PROVIDER_DEFINE(vm);
+SDT_PROBE_DEFINE(vm, , , vm__lowmem_cache);
+SDT_PROBE_DEFINE(vm, , , vm__lowmem_scan);
+
 #if !defined(NO_SWAPPING)
 /* the kernel process "vm_daemon"*/
 static void vm_daemon(void);
@@ -667,6 +673,7 @@ vm_pageout_grow_cache(int tries, vm_padd
 		 * may acquire locks and/or sleep, so they can only be invoked
 		 * when "tries" is greater than zero.
 		 */
+		SDT_PROBE0(vm, , , vm__lowmem_cache);
 		EVENTHANDLER_INVOKE(vm_lowmem, 0);
 
 		/*
@@ -920,6 +927,7 @@ vm_pageout_scan(struct vm_domain *vmd, i
 		/*
 		 * Decrease registered cache sizes.
 		 */
+		SDT_PROBE0(vm, , , vm__lowmem_scan);
 		EVENTHANDLER_INVOKE(vm_lowmem, 0);
 		/*
 		 * We do this explicitly after the caches have been



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201410032034.s93KYuUD075578>