From owner-freebsd-hackers  Fri Feb 22 14:06:01 2002
Delivered-To: freebsd-hackers@freebsd.org
Received: from apollo.backplane.com (apollo.backplane.com [216.240.41.2])
	by hub.freebsd.org (Postfix) with ESMTP id F137C37B402;
	Fri, 22 Feb 2002 14:05:43 -0800 (PST)
Received: (from dillon@localhost)
	by apollo.backplane.com (8.11.6/8.9.1) id g1MM1f431236;
	Fri, 22 Feb 2002 14:01:41 -0800 (PST)
	(envelope-from dillon)
Date: Fri, 22 Feb 2002 14:01:41 -0800 (PST)
From: Matthew Dillon
Message-Id: <200202222201.g1MM1f431236@apollo.backplane.com>
To: Andrew Mobbs, Paul Saab, Peter Wemm
Cc: hackers@FreeBSD.ORG
Subject: Test patch for msync/object-flushing performance (for stable)
References: <15478.31998.459219.178549@chiark.greenend.org.uk> <200202222042.g1MKg4u22700@apollo.backplane.com>
Sender: owner-freebsd-hackers@FreeBSD.ORG
Precedence: bulk
X-Loop: FreeBSD.ORG

    Ok, here is a test patch.  There are some instructions to go along
    with this patch, so continue reading.

    I have implemented two optimizations.  You can turn either, both, or
    neither on with a sysctl, and I would like those interested to test
    all four combinations.  Be sure to delete any test files and 'sync'
    a couple of times between each test run so you do not skew the
    results.

sysctl -w vm.msync_flush_flags=0

    No optimizations.  We don't try to sort the object flush (original
    behavior).  This is the default for this test patch.

sysctl -w vm.msync_flush_flags=1

    Hard sequential optimization.  Attempt to locate sequential pages by
    indexing through the requested flush range, performing
    vm_page_lookup()s.  If we miss more than a certain number of pages
    in a row, however, we break out of the loop (otherwise this could
    lock up the system when flushing a very large multi-gigabyte or
    multi-terabyte object).  This optimization works best when the user
    is msync()ing a specific known-to-be-mostly-dirty page range.
    The only downside is that this can eat more CPU in other cases.
    However, the upside is that for huge objects and small page ranges
    this optimization allows us to completely avoid scanning the
    object's memq, yielding an extreme performance benefit.

sysctl -w vm.msync_flush_flags=2

    Soft sequential optimization during the object->memq scan.
    vm_object_page_clean() already attempts to cluster write operations,
    but is limited to around 16 pages.  This optimization attempts to
    'glue' clustered ops together by looking for the next sequential
    page after the cluster that was just flushed and jumping to it for
    the next cluster.

sysctl -w vm.msync_flush_flags=3

    This turns on both optimizations.

    I do not formally sort the object->memq.  I looked at doing so, but
    it looked fairly expensive.

						-Matt

Index: vm/vm_object.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_object.c,v
retrieving revision 1.171.2.5
diff -u -r1.171.2.5 vm_object.c
--- vm/vm_object.c	3 Nov 2001 19:59:28 -0000	1.171.2.5
+++ vm/vm_object.c	22 Feb 2002 21:52:03 -0000
@@ -75,6 +75,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
@@ -89,7 +91,18 @@
 #include
 #include
 
-static void vm_object_qcollapse __P((vm_object_t object));
+#define EASY_SCAN_FACTOR	8
+
+#define MSYNC_FLUSH_HARDSEQUENTIAL	0x01
+#define MSYNC_FLUSH_SOFTSEQUENTIAL	0x02
+
+static int msync_flush_flags = 0;
+SYSCTL_INT(_vm, OID_AUTO, msync_flush_flags,
+	CTLFLAG_RW, &msync_flush_flags, 0, "");
+
+
+static void vm_object_qcollapse (vm_object_t object);
+static int vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags);
 
 /*
  * Virtual memory objects maintain the actual data
@@ -506,21 +519,12 @@
 	vm_pindex_t end;
 	int flags;
 {
-	vm_page_t p, np, tp;
+	vm_page_t p, np;
 	vm_offset_t tstart, tend;
 	vm_pindex_t pi;
-	int s;
 	struct vnode *vp;
-	int runlen;
-	int maxf;
-	int chkb;
-	int maxb;
-	int i;
 	int clearobjflags;
 	int pagerflags;
-	vm_page_t maf[vm_pageout_page_count];
-	vm_page_t mab[vm_pageout_page_count];
-	vm_page_t ma[vm_pageout_page_count];
 	int curgeneration;
 
 	if (object->type != OBJT_VNODE ||
@@ -534,6 +538,9 @@
 
 	vm_object_set_flag(object, OBJ_CLEANING);
 
+	/*
+	 * Handle 'entire object' case
+	 */
 	tstart = start;
 	if (end == 0) {
 		tend = object->size;
@@ -542,6 +549,72 @@
 	}
 
 	/*
+	 * If the caller is smart and only msync()s a range he knows is
+	 * dirty, we may be able to avoid an object scan.  This results in
+	 * a phenomenal improvement in performance.  We cannot do this
+	 * as a matter of course because the object may be huge - e.g.
+	 * the size might be in the gigabytes or terabytes.
+	 */
+	if (msync_flush_flags & MSYNC_FLUSH_HARDSEQUENTIAL) {
+		vm_offset_t tscan;
+		int scanlimit;
+		int scanreset;
+
+		scanreset = object->resident_page_count / EASY_SCAN_FACTOR;
+		if (scanreset < 16)
+			scanreset = 16;
+
+		scanlimit = scanreset;
+		tscan = tstart;
+		while (tscan < tend) {
+			curgeneration = object->generation;
+			p = vm_page_lookup(object, tscan);
+			if (p == NULL || p->valid == 0 ||
+			    (p->queue - p->pc) == PQ_CACHE) {
+				if (--scanlimit == 0)
+					break;
+				++tscan;
+				continue;
+			}
+			vm_page_test_dirty(p);
+			if ((p->dirty & p->valid) == 0) {
+				if (--scanlimit == 0)
+					break;
+				++tscan;
+				continue;
+			}
+			/*
+			 * If we have been asked to skip nosync pages and
+			 * this is a nosync page, we can't continue.
+			 */
+			if ((flags & OBJPC_NOSYNC) && (p->flags & PG_NOSYNC)) {
+				if (--scanlimit == 0)
+					break;
+				++tscan;
+				continue;
+			}
+			scanlimit = scanreset;
+
+			/*
+			 * This returns 0 if it was unable to busy the first
+			 * page (i.e. had to sleep).
+			 */
+			tscan += vm_object_page_collect_flush(object, p, curgeneration, pagerflags);
+		}
+
+		/*
+		 * If everything was dirty and we flushed it successfully,
+		 * and the requested range is not the entire object, we
+		 * don't have to mess with CLEANCHK or MIGHTBEDIRTY and can
+		 * return immediately.
+		 */
+		if (tscan >= tend && (tstart || tend < object->size)) {
+			vm_object_clear_flag(object, OBJ_CLEANING);
+			return;
+		}
+	}
+
+	/*
 	 * Generally set CLEANCHK interlock and make the page read-only so
 	 * we can then clear the object flags.
 	 *
@@ -578,8 +651,11 @@
 	curgeneration = object->generation;
 
 	for(p = TAILQ_FIRST(&object->memq); p; p = np) {
+		int n;
+
 		np = TAILQ_NEXT(p, listq);
 
+again:
 		pi = p->pindex;
 		if (((p->flags & PG_CLEANCHK) == 0) ||
 			(pi < tstart) || (pi >= tend) ||
@@ -605,17 +681,86 @@
 			continue;
 		}
 
-		s = splvm();
-		while (vm_page_sleep_busy(p, TRUE, "vpcwai")) {
-			if (object->generation != curgeneration) {
-				splx(s);
-				goto rescan;
+		n = vm_object_page_collect_flush(object, p,
+			curgeneration, pagerflags);
+		if (n == 0)
+			goto rescan;
+		if (object->generation != curgeneration)
+			goto rescan;
+
+		/*
+		 * Try to optimize the next page.  If we can't we pick up
+		 * our (random) scan where we left off.
+		 */
+		if (msync_flush_flags & MSYNC_FLUSH_SOFTSEQUENTIAL) {
+			if ((p = vm_page_lookup(object, pi + n)) != NULL)
+				goto again;
+		}
+	}
+
+#if 0
+	VOP_FSYNC(vp, NULL, (pagerflags & VM_PAGER_PUT_SYNC)?MNT_WAIT:0, curproc);
+#endif
+
+	vm_object_clear_flag(object, OBJ_CLEANING);
+	return;
+}
+
+static int
+vm_object_page_collect_flush(vm_object_t object, vm_page_t p, int curgeneration, int pagerflags)
+{
+	int runlen;
+	int s;
+	int maxf;
+	int chkb;
+	int maxb;
+	int i;
+	vm_pindex_t pi;
+	vm_page_t maf[vm_pageout_page_count];
+	vm_page_t mab[vm_pageout_page_count];
+	vm_page_t ma[vm_pageout_page_count];
+
+	s = splvm();
+	pi = p->pindex;
+	while (vm_page_sleep_busy(p, TRUE, "vpcwai")) {
+		if (object->generation != curgeneration) {
+			splx(s);
+			return(0);
+		}
+	}
+
+	maxf = 0;
+	for(i = 1; i < vm_pageout_page_count; i++) {
+		vm_page_t tp;
+
+		if ((tp = vm_page_lookup(object, pi + i)) != NULL) {
+			if ((tp->flags & PG_BUSY) ||
+				(tp->flags & PG_CLEANCHK) == 0 ||
+				(tp->busy != 0))
+				break;
+			if((tp->queue - tp->pc) == PQ_CACHE) {
+				vm_page_flag_clear(tp, PG_CLEANCHK);
+				break;
+			}
+			vm_page_test_dirty(tp);
+			if ((tp->dirty & tp->valid) == 0) {
+				vm_page_flag_clear(tp, PG_CLEANCHK);
+				break;
 			}
+			maf[ i - 1 ] = tp;
+			maxf++;
+			continue;
 		}
+		break;
+	}
+
+	maxb = 0;
+	chkb = vm_pageout_page_count - maxf;
+	if (chkb) {
+		for(i = 1; i < chkb;i++) {
+			vm_page_t tp;
 
-		maxf = 0;
-		for(i=1;i<vm_pageout_page_count;i++) {
-			if ((tp = vm_page_lookup(object, pi + i)) != NULL) {
-				if ((tp->flags & PG_BUSY) ||
 				(tp->flags & PG_CLEANCHK) == 0 ||
 				(tp->busy != 0))
@@ -629,71 +774,45 @@
 				vm_page_flag_clear(tp, PG_CLEANCHK);
 				break;
 			}
-				maf[ i - 1 ] = tp;
-				maxf++;
+			mab[ i - 1 ] = tp;
+			maxb++;
 			continue;
 		}
 		break;
 	}
+	}
 
-		maxb = 0;
-		chkb = vm_pageout_page_count -  maxf;
-		if (chkb) {
-			for(i = 1; i < chkb;i++) {
-				if ((tp = vm_page_lookup(object, pi - i)) != NULL) {
-					if ((tp->flags & PG_BUSY) ||
-						(tp->flags & PG_CLEANCHK) == 0 ||
-						(tp->busy != 0))
-						break;
-					if((tp->queue - tp->pc) == PQ_CACHE) {
-						vm_page_flag_clear(tp, PG_CLEANCHK);
-						break;
-					}
-					vm_page_test_dirty(tp);
-					if ((tp->dirty & tp->valid) == 0) {
-						vm_page_flag_clear(tp, PG_CLEANCHK);
-						break;
-					}
-					mab[ i - 1 ] = tp;
-					maxb++;
-					continue;
-				}
-				break;
-			}
-		}
+	for(i = 0; i < maxb; i++) {
+		int index = (maxb - i) - 1;
+		ma[index] = mab[i];
+		vm_page_flag_clear(ma[index], PG_CLEANCHK);
+	}
+	vm_page_flag_clear(p, PG_CLEANCHK);
+	ma[maxb] = p;
+	for(i = 0; i < maxf; i++) {
+		int index = (maxb + i) + 1;
+		ma[index] = maf[i];
+		vm_page_flag_clear(ma[index], PG_CLEANCHK);
+	}
+	runlen = maxb + maxf + 1;
 
-		for(i=0;i<maxb;i++) {
-			int index = (maxb - i) - 1;
-			ma[index] = mab[i];
-			vm_page_flag_clear(ma[index], PG_CLEANCHK);
-		}
-		vm_page_flag_clear(p, PG_CLEANCHK);
-		ma[maxb] = p;
-		for(i=0;i<maxf;i++) {
-			int index = (maxb + i) + 1;
-			ma[index] = maf[i];
-			vm_page_flag_clear(ma[index], PG_CLEANCHK);
-		}
-		runlen = maxb + maxf + 1;
-
-		splx(s);
-		vm_pageout_flush(ma, runlen, pagerflags);
-		for (i = 0; i < runlen; i++) {
-			if (ma[i]->valid & ma[i]->dirty) {
-				vm_page_protect(ma[i], VM_PROT_READ);
-				vm_page_flag_set(ma[i], PG_CLEANCHK);
-			}
+	splx(s);
+	vm_pageout_flush(ma, runlen, pagerflags);
+	for (i = 0; i < runlen; i++) {
+		if (ma[i]->valid & ma[i]->dirty) {
+			vm_page_protect(ma[i], VM_PROT_READ);
+			vm_page_flag_set(ma[i], PG_CLEANCHK);
+
+			/*
+			 * maxf will end up being the actual number of pages
+			 * we wrote out contiguously, non-inclusive of the
+			 * first page.  We do not count look-behind pages.
+			 */
+			if (i >= maxb + 1 && (maxf > i - maxb - 1))
+				maxf = i - maxb - 1;
 		}
-		if (object->generation != curgeneration)
-			goto rescan;
 	}
-
-#if 0
-	VOP_FSYNC(vp, NULL, (pagerflags & VM_PAGER_PUT_SYNC)?MNT_WAIT:0, curproc);
-#endif
-
-	vm_object_clear_flag(object, OBJ_CLEANING);
-	return;
+	return(maxf + 1);
 }
 
 #ifdef not_used

To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-hackers" in the body of the message