Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 23 Aug 2014 02:37:10 +0100
From:      "Steven Hartland" <killing@multiplay.co.uk>
To:        "John" <john@theusgroup.com>, <freebsd-fs@freebsd.org>
Subject:   Re: [Bug 187594] [zfs] [patch] ZFS ARC behavior problem and fix
Message-ID:  <F86D0B957D594F1B8FC2FF02A563E3AE@multiplay.co.uk>
References:  <bug-187594-3630@https.bugs.freebsd.org/bugzilla/> <bug-187594-3630-o97lWSCObB@https.bugs.freebsd.org/bugzilla/> <CE3228E2C273EA4A9DC12FA75E921B8C4BC4948E6F@PAIMAIL.pai.local> <53F73A39.9090000@freebsd.org> <20140822211819.62F8096D@mail.theusgroup.com>

next in thread | previous in thread | raw e-mail | index | archive | help
This is a multi-part message in MIME format.

------=_NextPart_000_04A6_01CFBE7B.23C80680
Content-Type: text/plain;
	format=flowed;
	charset="iso-8859-1";
	reply-type=original
Content-Transfer-Encoding: 7bit

----- Original Message ----- 
From: "John via freebsd-fs" <freebsd-fs@freebsd.org>

> Given how long this patch has been in use with nothing but positive 
> feedback,
> and still having not been committed, one has to wonder why?
>
> Is it NIH, or something else? At least one committer commented in the 
> past
> that Karl's approach isn't how he would have done it. Is that the 
> problem?
>
> It's ridiculous we've had to keep adding this patch to keep our zfs 
> systems
> running with decent performance.
>
> Why hasn't this been committed?

I've actually been looking at this patch today in relation to my
investigation of:
https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=191510

I would appreciate it if people could test the attached patch, which
was created against stable/10.

It should achieve the same as Karl's patch, and in addition it:
* More closely matches the original Solaris logic.
* Provides better control of the reclaim trigger (absolute, not
  percentage based, which becomes a problem on larger memory
  machines).
* Uses direct kernel values instead of interfacing via sysctls.
* Should fix the issue identified in #191510 as well.

The basic design is that it will trigger ARC reclaim when the number
of free pages drops below vfs.zfs.arc_free_target, which by default is
3 x the VM's target free pages as exposed by vm.v_free_target
(matching Solaris).

It's really late here now and I've only just knocked it together to
test it out on our event big cache box over the weekend, so it may
be a little rough.

All feedback welcome :)

    Regards
    Steve 

------=_NextPart_000_04A6_01CFBE7B.23C80680
Content-Type: application/octet-stream;
	name="arc-reclaim.patch"
Content-Transfer-Encoding: quoted-printable
Content-Disposition: attachment;
	filename="arc-reclaim.patch"

Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c=0A=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=0A=
--- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(revision =
270315)=0A=
+++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	(working copy)=0A=
@@ -138,6 +138,7 @@=0A=
 #include <sys/sdt.h>=0A=
 =0A=
 #include <vm/vm_pageout.h>=0A=
+#include <sys/vmmeter.h>=0A=
 =0A=
 #ifdef illumos=0A=
 #ifndef _KERNEL=0A=
@@ -204,11 +205,23 @@=0A=
 int zfs_arc_p_min_shift =3D 0;=0A=
 int zfs_disable_dup_eviction =3D 0;=0A=
 uint64_t zfs_arc_average_blocksize =3D 8 * 1024; /* 8KB */=0A=
+u_int zfs_arc_free_target =3D (1 << 30) / PAGE_SIZE; /* 1GB */=0A=
 =0A=
+static void=0A=
+arc_free_target_init(void *unused __unused)=0A=
+{=0A=
+=0A=
+    zfs_arc_free_target =3D (uint64_t)cnt.v_free_target * 3;=0A=
+}=0A=
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,=0A=
+    arc_free_target_init, NULL);=0A=
+=0A=
+=0A=
 TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);=0A=
 TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);=0A=
 TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);=0A=
 TUNABLE_QUAD("vfs.zfs.arc_average_blocksize", =
&zfs_arc_average_blocksize);=0A=
+TUNABLE_INT("vfs.zfs.arc_free_target", &zfs_arc_free_target);=0A=
 SYSCTL_DECL(_vfs_zfs);=0A=
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, =
0,=0A=
     "Maximum ARC size");=0A=
@@ -217,6 +230,9 @@=0A=
 SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,=0A=
     &zfs_arc_average_blocksize, 0,=0A=
     "ARC average blocksize");=0A=
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_free_target, CTLFLAG_RWTUN,=0A=
+    &zfs_arc_free_target, 0,=0A=
+    "Desired number of free pages below which ARC triggers reclaim");=0A=
 =0A=
 /*=0A=
  * Note that buffers can be in one of 6 states:=0A=
@@ -2458,6 +2474,9 @@=0A=
 	if (needfree)=0A=
 		return (1);=0A=
 =0A=
+	if (cnt.v_free_count < zfs_arc_free_target)=0A=
+		return (1);=0A=
+=0A=
 	/*=0A=
 	 * Cooperate with pagedaemon when it's time for it to scan=0A=
 	 * and reclaim some pages.=0A=
@@ -2507,9 +2526,6 @@=0A=
 	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))=0A=
 		return (1);=0A=
 #endif=0A=
-#else	/* !sun */=0A=
-	if (kmem_used() > (kmem_size() * 3) / 4)=0A=
-		return (1);=0A=
 #endif	/* sun */=0A=
 =0A=
 #else=0A=
Index: sys/vm/vm_pageout.c=0A=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=0A=
--- sys/vm/vm_pageout.c	(revision 270315)=0A=
+++ sys/vm/vm_pageout.c	(working copy)=0A=
@@ -115,10 +115,14 @@=0A=
 =0A=
 /* the kernel process "vm_pageout"*/=0A=
 static void vm_pageout(void);=0A=
+static void vm_pageout_init(void);=0A=
 static int vm_pageout_clean(vm_page_t);=0A=
 static void vm_pageout_scan(struct vm_domain *vmd, int pass);=0A=
 static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);=0A=
 =0A=
+SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, =
vm_pageout_init,=0A=
+    NULL);=0A=
+=0A=
 struct proc *pageproc;=0A=
 =0A=
 static struct kproc_desc page_kp =3D {=0A=
@@ -126,7 +130,7 @@=0A=
 	vm_pageout,=0A=
 	&pageproc=0A=
 };=0A=
-SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start,=0A=
+SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start,=0A=
     &page_kp);=0A=
 =0A=
 #if !defined(NO_SWAPPING)=0A=
@@ -1647,15 +1651,11 @@=0A=
 }=0A=
 =0A=
 /*=0A=
- *	vm_pageout is the high level pageout daemon.=0A=
+ *	vm_pageout_init initialises basic pageout daemon settings.=0A=
  */=0A=
 static void=0A=
-vm_pageout(void)=0A=
+vm_pageout_init(void)=0A=
 {=0A=
-#if MAXMEMDOM > 1=0A=
-	int error, i;=0A=
-#endif=0A=
-=0A=
 	/*=0A=
 	 * Initialize some paging parameters.=0A=
 	 */=0A=
@@ -1701,7 +1701,18 @@=0A=
 	/* XXX does not really belong here */=0A=
 	if (vm_page_max_wired =3D=3D 0)=0A=
 		vm_page_max_wired =3D cnt.v_free_count / 3;=0A=
+}=0A=
 =0A=
+/*=0A=
+ *	vm_pageout is the high level pageout daemon.=0A=
+ */=0A=
+static void=0A=
+vm_pageout(void)=0A=
+{=0A=
+#if MAXMEMDOM > 1=0A=
+	int error, i;=0A=
+#endif=0A=
+=0A=
 	swap_pager_swap_init();=0A=
 #if MAXMEMDOM > 1=0A=
 	for (i =3D 1; i < vm_ndomains; i++) {=0A=

------=_NextPart_000_04A6_01CFBE7B.23C80680--




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?F86D0B957D594F1B8FC2FF02A563E3AE>