From owner-freebsd-stable@FreeBSD.ORG Mon Jul 12 09:39:42 2010 Return-Path: Delivered-To: freebsd-stable@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id F13FB106566C for ; Mon, 12 Jul 2010 09:39:41 +0000 (UTC) (envelope-from peterjeremy@acm.org) Received: from mail14.syd.optusnet.com.au (mail14.syd.optusnet.com.au [211.29.132.195]) by mx1.freebsd.org (Postfix) with ESMTP id 6F9308FC24 for ; Mon, 12 Jul 2010 09:39:41 +0000 (UTC) Received: from server.vk2pj.dyndns.org (c211-30-160-13.belrs4.nsw.optusnet.com.au [211.30.160.13]) by mail14.syd.optusnet.com.au (8.13.1/8.13.1) with ESMTP id o6C9db9q032537 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=NO); Mon, 12 Jul 2010 19:39:38 +1000 X-Bogosity: Ham, spamicity=0.000000 Received: from server.vk2pj.dyndns.org (localhost.vk2pj.dyndns.org [127.0.0.1]) by server.vk2pj.dyndns.org (8.14.4/8.14.4) with ESMTP id o6C9dYRl027963; Mon, 12 Jul 2010 19:39:34 +1000 (EST) (envelope-from peter@server.vk2pj.dyndns.org) Received: (from peter@localhost) by server.vk2pj.dyndns.org (8.14.4/8.14.4/Submit) id o6C9dXlr027962; Mon, 12 Jul 2010 19:39:33 +1000 (EST) (envelope-from peter) Date: Mon, 12 Jul 2010 19:39:33 +1000 From: Peter Jeremy To: Richard Lee Message-ID: <20100712093933.GA27950@server.vk2pj.dyndns.org> References: <20100711182511.GA21063@soda.CSUA.Berkeley.EDU> <20100712093818.GA27693@server.vk2pj.dyndns.org> MIME-Version: 1.0 Content-Type: multipart/signed; micalg=pgp-sha1; protocol="application/pgp-signature"; boundary="mojUlQ0s9EVzWg2t" Content-Disposition: inline In-Reply-To: <20100712093818.GA27693@server.vk2pj.dyndns.org> X-PGP-Key: http://members.optusnet.com.au/peterjeremy/pubkey.asc User-Agent: Mutt/1.5.20 (2009-06-14) Cc: freebsd-stable@freebsd.org Subject: Re: Serious zfs slowdown when mixed with another file system (ufs/msdosfs/etc.). 
X-BeenThere: freebsd-stable@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: Production branch of FreeBSD source code List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 12 Jul 2010 09:39:42 -0000 --mojUlQ0s9EVzWg2t Content-Type: multipart/mixed; boundary="RnlQjJ0d97Da+TV1" Content-Disposition: inline --RnlQjJ0d97Da+TV1 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline Content-Transfer-Encoding: quoted-printable On 2010-Jul-12 19:38:18 +1000, Peter Jeremy = wrote: >I have been using the attached arc.patch1 based on a patch written by >Artem Belevich (see http://pastebin.com/ZCkzkWcs ) >for about a month. I have had reasonable success with it (and junked >my cronjob) but have managed to wedge my system a couple of times >whilst doing zfs send|recv. Whilst looking at that diff, I just >noticed a nasty signed/unsigned bug that could bite in low memory >conditions and have revised it to arc.patch2 (untested as yet). Let me try actually attaching those patches... Sorry. 
--=20 Peter Jeremy --RnlQjJ0d97Da+TV1 Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="arc.patch1" Content-Transfer-Encoding: quoted-printable Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D RCS file: /usr/ncvs/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.= c,v retrieving revision 1.22.2.6 diff -u -r1.22.2.6 arc.c --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c 24 May 2010 20:09:= 40 -0000 1.22.2.6 +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c 12 Jun 2010 21:04:= 13 -0000 @@ -183,10 +183,15 @@ int zfs_arc_shrink_shift =3D 0; int zfs_arc_p_min_shift =3D 0; =20 +uint64_t zfs_arc_bp_active; +uint64_t zfs_arc_bp_inactive; + TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); +TUNABLE_QUAD("vfs.zfs.arc_bp_active", &zfs_arc_bp_active); +TUNABLE_QUAD("vfs.zfs.arc_bp_inactive", &zfs_arc_bp_inactive); SYSCTL_DECL(_vfs_zfs); SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, "Maximum ARC size"); @@ -195,6 +200,11 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, &zfs_mdcomp_disable, 0, "Disable metadata compression"); =20 +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_bp_active, CTLFLAG_RW|CTLFLAG_TUN, &zf= s_arc_bp_active, 0, + "Start ARC backpressure if active memory is below this limit"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_bp_inactive, CTLFLAG_RW|CTLFLAG_TUN, &= zfs_arc_bp_inactive, 0, + "Start ARC backpressure if inactive memory is below this limit"); + /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) @@ -2103,7 +2113,6 @@ } =20 static int needfree =3D 0; - 
static int arc_reclaim_needed(void) { @@ -2112,20 +2121,58 @@ #endif =20 #ifdef _KERNEL - if (needfree) - return (1); + /* We've grown too much, */ if (arc_size > arc_c_max) return (1); + + /* Pagedaemon is stuck, let's free something right away */ + if (vm_pageout_pages_needed) + return 1; + + /* Check if inactive list have grown too much */ + if ( zfs_arc_bp_inactive + && (ptoa((uintmax_t)cnt.v_inactive_count) > zfs_arc_bp_inactive)) { + /* tell pager to reap 1/2th of inactive queue*/ + atomic_add_int(&vm_pageout_deficit, cnt.v_inactive_count/2); + pagedaemon_wakeup(); + return needfree; + } + + /* Same for active list... */ + if ( zfs_arc_bp_active + && (ptoa((uintmax_t)cnt.v_active_count) > zfs_arc_bp_active)) { + atomic_add_int(&vm_pageout_deficit, cnt.v_active_count/2); + pagedaemon_wakeup(); + return needfree; + } + +=09 + /* Old style behavior -- ARC gives up memory whenever page daemon asks.. = */ + if (needfree) + return 1; + + /* + We got here either because active/inactive lists are + getting short or because we've been called during voluntary + ARC size checks. Kind of gray area... + */ + + /* If we didn't reach our minimum yet, don't rush to give memory up..*/ if (arc_size <=3D arc_c_min) return (0); =20 + /* If we're really short on memory now, give it up. */ + if (vm_page_count_min()) { + return (1); + } +=09 /* - * If pages are needed or we're within 2048 pages - * of needing to page need to reclaim + * If we're within 2048 pages of pagedaemon start, reclaim... 
*/ - if (vm_pages_needed || (vm_paging_target() > -2048)) + if (vm_pages_needed && (vm_paging_target() > -2048)) return (1); =20 + #if 0 /* * take 'desfree' extra pages, so we reclaim sooner, rather than later @@ -2169,8 +2216,6 @@ return (1); #endif #else - if (kmem_used() > (kmem_size() * 3) / 4) - return (1); #endif =20 #else @@ -2279,7 +2324,7 @@ if (arc_eviction_list !=3D NULL) arc_do_user_evicts(); =20 - if (arc_reclaim_needed()) { + if (needfree) { needfree =3D 0; #ifdef _KERNEL wakeup(&needfree); @@ -3611,10 +3656,15 @@ { #ifdef _KERNEL uint64_t inflight_data =3D arc_anon->arcs_size; - uint64_t available_memory =3D ptoa((uintmax_t)cnt.v_free_count); + uint64_t available_memory; static uint64_t page_load =3D 0; static uint64_t last_txg =3D 0; =20 + /* How much memory is potentially available */ + available_memory =3D ptoa((uintmax_t)cnt.v_free_count); + available_memory +=3D ptoa((uintmax_t)cnt.v_cache_count); + available_memory -=3D ptoa((uintmax_t)cnt.v_free_min); + =20 #if 0 #if defined(__i386) available_memory =3D --RnlQjJ0d97Da+TV1 Content-Type: text/plain; charset=us-ascii Content-Disposition: attachment; filename="arc.patch2" Content-Transfer-Encoding: quoted-printable Index: sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D= =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D RCS file: /usr/ncvs/src/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.= c,v retrieving revision 1.22.2.6 diff -u -r1.22.2.6 arc.c --- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c 24 May 2010 20:09:= 40 -0000 1.22.2.6 +++ sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c 12 Jul 2010 09:21:= 31 -0000 @@ -183,10 +183,15 @@ int zfs_arc_shrink_shift =3D 0; int zfs_arc_p_min_shift =3D 0; =20 +uint64_t zfs_arc_bp_active; +uint64_t zfs_arc_bp_inactive; + TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max); 
TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min); TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit); TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); +TUNABLE_QUAD("vfs.zfs.arc_bp_active", &zfs_arc_bp_active); +TUNABLE_QUAD("vfs.zfs.arc_bp_inactive", &zfs_arc_bp_inactive); SYSCTL_DECL(_vfs_zfs); SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0, "Maximum ARC size"); @@ -195,6 +200,11 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, &zfs_mdcomp_disable, 0, "Disable metadata compression"); =20 +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_bp_active, CTLFLAG_RW|CTLFLAG_TUN, &zf= s_arc_bp_active, 0, + "Start ARC backpressure if active memory is below this limit"); +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, arc_bp_inactive, CTLFLAG_RW|CTLFLAG_TUN, &= zfs_arc_bp_inactive, 0, + "Start ARC backpressure if inactive memory is below this limit"); + /* * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) @@ -2103,7 +2113,6 @@ } =20 static int needfree =3D 0; - static int arc_reclaim_needed(void) { @@ -2112,20 +2121,58 @@ #endif =20 #ifdef _KERNEL - if (needfree) - return (1); + /* We've grown too much, */ if (arc_size > arc_c_max) return (1); + + /* Pagedaemon is stuck, let's free something right away */ + if (vm_pageout_pages_needed) + return 1; + + /* Check if inactive list have grown too much */ + if ( zfs_arc_bp_inactive + && (ptoa((uintmax_t)cnt.v_inactive_count) > zfs_arc_bp_inactive)) { + /* tell pager to reap 1/2th of inactive queue*/ + atomic_add_int(&vm_pageout_deficit, cnt.v_inactive_count/2); + pagedaemon_wakeup(); + return needfree; + } + + /* Same for active list... */ + if ( zfs_arc_bp_active + && (ptoa((uintmax_t)cnt.v_active_count) > zfs_arc_bp_active)) { + atomic_add_int(&vm_pageout_deficit, cnt.v_active_count/2); + pagedaemon_wakeup(); + return needfree; + } + +=09 + /* Old style behavior -- ARC gives up memory whenever page daemon asks.. 
= */ + if (needfree) + return 1; + + /* + We got here either because active/inactive lists are + getting short or because we've been called during voluntary + ARC size checks. Kind of gray area... + */ + + /* If we didn't reach our minimum yet, don't rush to give memory up..*/ if (arc_size <=3D arc_c_min) return (0); =20 + /* If we're really short on memory now, give it up. */ + if (vm_page_count_min()) { + return (1); + } +=09 /* - * If pages are needed or we're within 2048 pages - * of needing to page need to reclaim + * If we're within 2048 pages of pagedaemon start, reclaim... */ - if (vm_pages_needed || (vm_paging_target() > -2048)) + if (vm_pages_needed && (vm_paging_target() > -2048)) return (1); =20 + #if 0 /* * take 'desfree' extra pages, so we reclaim sooner, rather than later @@ -2169,8 +2216,6 @@ return (1); #endif #else - if (kmem_used() > (kmem_size() * 3) / 4) - return (1); #endif =20 #else @@ -2279,7 +2324,7 @@ if (arc_eviction_list !=3D NULL) arc_do_user_evicts(); =20 - if (arc_reclaim_needed()) { + if (needfree) { needfree =3D 0; #ifdef _KERNEL wakeup(&needfree); @@ -3611,10 +3656,17 @@ { #ifdef _KERNEL uint64_t inflight_data =3D arc_anon->arcs_size; - uint64_t available_memory =3D ptoa((uintmax_t)cnt.v_free_count); + uint64_t available_memory; static uint64_t page_load =3D 0; static uint64_t last_txg =3D 0; =20 + /* How much memory is potentially available */ + available_memory =3D (uint64_t)cnt.v_free_count + cnt.v_cache_count; + if (available_memory > cnt.v_free_min) + available_memory =3D ptoa(available_memory - cnt.v_free_min); + else + available_memory =3D 0; + #if 0 #if defined(__i386) available_memory =3D --RnlQjJ0d97Da+TV1-- --mojUlQ0s9EVzWg2t Content-Type: application/pgp-signature Content-Disposition: inline -----BEGIN PGP SIGNATURE----- Version: GnuPG v2.0.15 (FreeBSD) iEYEARECAAYFAkw64tUACgkQ/opHv/APuIdaZACfbuBNLVzG4Ktjw5uy5rmh1xrz PHIAmQFCVsVKV8DQg2BrlcwJulbXST89 =coqu -----END PGP SIGNATURE----- --mojUlQ0s9EVzWg2t--