Date:      Wed, 19 Oct 2016 11:09:29 +0000 (UTC)
From:      Konstantin Belousov <kib@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r307626 - head/sys/ufs/ffs
Message-ID:  <201610191109.u9JB9TTC002727@repo.freebsd.org>

Author: kib
Date: Wed Oct 19 11:09:29 2016
New Revision: 307626
URL: https://svnweb.freebsd.org/changeset/base/307626

Log:
  Add an FFS pager, which uses buffer cache read operations to
  validate pages.  See the comments for a more detailed description
  of the algorithm.
  
  The pager is used unconditionally when the block size of the
  underlying device is larger than the machine page size, since the
  local vnode pager cannot handle that configuration [1].  Otherwise,
  the vfs.ffs.use_buf_pager sysctl allows switching to the local
  pager.
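
  For illustration, the knob can be inspected and flipped from
  userland with sysctlbyname(3), or simply with
  "sysctl vfs.ffs.use_buf_pager=0".  A minimal sketch, not part of
  this commit (setting the value requires root):

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <err.h>
	#include <stdio.h>

	int
	main(void)
	{
		int new, old;
		size_t oldlen;

		new = 0;	/* fall back to the local vnode pager */
		oldlen = sizeof(old);
		if (sysctlbyname("vfs.ffs.use_buf_pager", &old, &oldlen,
		    &new, sizeof(new)) == -1)
			err(1, "sysctlbyname(vfs.ffs.use_buf_pager)");
		printf("use_buf_pager: %d -> %d\n", old, new);
		return (0);
	}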
  
  Measurements demonstrated no regression in the ever-important
  buildworld benchmark, and a small (~5%) throughput improvement in a
  dbench microbenchmark configuration over swap-backed md(4).
  
  The code can be generalized and reused for other filesystems that
  use the buffer cache.
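
  For instance, a hypothetical generalized helper (all names here are
  invented for illustration) would only need the block-size
  computation as a filesystem-supplied callback:

	/* Hypothetical: block size valid at the given logical block. */
	typedef int (*gbp_getblksz_t)(struct vnode *, daddr_t, long *);

	int	vfs_buf_getpages(struct vnode *vp, vm_page_t *ma,
		    int count, int *rbehind, int *rahead,
		    gbp_getblksz_t get_blksize);

	/* A filesystem with a fixed block size could then use: */
	static int
	myfs_gbp_getblksz(struct vnode *vp, daddr_t lbn, long *sz)
	{

		*sz = vp->v_mount->mnt_stat.f_iosize;
		return (0);
	}

	static int
	myfs_getpages(struct vop_getpages_args *ap)
	{

		return (vfs_buf_getpages(ap->a_vp, ap->a_m, ap->a_count,
		    ap->a_rbehind, ap->a_rahead, myfs_gbp_getblksz));
	}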
  
  Reported by:	Anton Yuzhaninov <citrin@citrin.ru> [1]
  Tested by:	pho
  Benchmarked by:	mjg, pho
  Reviewed by:	alc, markj, mckusick (previous version)
  Sponsored by:	The FreeBSD Foundation
  MFC after:	2 weeks
  Differential revision:	https://reviews.freebsd.org/D8198

Modified:
  head/sys/ufs/ffs/ffs_vnops.c

Modified: head/sys/ufs/ffs/ffs_vnops.c
==============================================================================
--- head/sys/ufs/ffs/ffs_vnops.c	Wed Oct 19 10:01:04 2016	(r307625)
+++ head/sys/ufs/ffs/ffs_vnops.c	Wed Oct 19 11:09:29 2016	(r307626)
@@ -77,6 +77,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/priv.h>
 #include <sys/rwlock.h>
 #include <sys/stat.h>
+#include <sys/sysctl.h>
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 
@@ -86,6 +87,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
+#include <vm/vm_pageout.h>
 #include <vm/vnode_pager.h>
 
 #include <ufs/ufs/extattr.h>
@@ -102,8 +104,9 @@ __FBSDID("$FreeBSD$");
 #ifdef DIRECTIO
 extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
 #endif
-static vop_fsync_t	ffs_fsync;
 static vop_fdatasync_t	ffs_fdatasync;
+static vop_fsync_t	ffs_fsync;
+static vop_getpages_t	ffs_getpages;
 static vop_lock1_t	ffs_lock;
 static vop_read_t	ffs_read;
 static vop_write_t	ffs_write;
@@ -119,13 +122,12 @@ static vop_openextattr_t	ffs_openextattr
 static vop_setextattr_t	ffs_setextattr;
 static vop_vptofh_t	ffs_vptofh;
 
-
 /* Global vfs data structures for ufs. */
 struct vop_vector ffs_vnodeops1 = {
 	.vop_default =		&ufs_vnodeops,
 	.vop_fsync =		ffs_fsync,
 	.vop_fdatasync =	ffs_fdatasync,
-	.vop_getpages =		vnode_pager_local_getpages,
+	.vop_getpages =		ffs_getpages,
 	.vop_getpages_async =	vnode_pager_local_getpages_async,
 	.vop_lock1 =		ffs_lock,
 	.vop_read =		ffs_read,
@@ -147,7 +149,7 @@ struct vop_vector ffs_vnodeops2 = {
 	.vop_default =		&ufs_vnodeops,
 	.vop_fsync =		ffs_fsync,
 	.vop_fdatasync =	ffs_fdatasync,
-	.vop_getpages =		vnode_pager_local_getpages,
+	.vop_getpages =		ffs_getpages,
 	.vop_getpages_async =	vnode_pager_local_getpages_async,
 	.vop_lock1 =		ffs_lock,
 	.vop_read =		ffs_read,
@@ -1784,3 +1786,166 @@ vop_vptofh {
 	ufhp->ufid_gen = ip->i_gen;
 	return (0);
 }
+
+SYSCTL_DECL(_vfs_ffs);
+static int use_buf_pager = 1;
+SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
+    "Always use buffer pager instead of bmap");
+static int buf_pager_relbuf;
+SYSCTL_INT(_vfs_ffs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN,
+    &buf_pager_relbuf, 0,
+    "Make buffer pager release buffers after reading");
+
+/*
+ * The FFS pager.  It uses buffer reads to validate pages.
+ *
+ * In contrast to the generic local pager from vm/vnode_pager.c, this
+ * pager correctly and easily handles volumes where the underlying
+ * device block size is greater than the machine page size.  The
+ * buffer cache transparently extends the requested page run to be
+ * aligned at the block boundary, and does the necessary bogus-page
+ * replacements in the added pages to avoid obliterating already
+ * valid pages.
+ *
+ * The only non-trivial issue is that the exclusive busy state for
+ * pages, which is assumed by the vm_pager_getpages() interface, is
+ * incompatible with the VMIO buffer cache's desire to share-busy the
+ * pages.  This function performs a trivial downgrade of the pages'
+ * state before reading buffers, and a less trivial upgrade from the
+ * shared-busy to excl-busy state after the read.
+ */
+static int
+ffs_getpages(struct vop_getpages_args *ap)
+{
+	struct vnode *vp;
+	vm_page_t *ma, m;
+	vm_object_t object;
+	struct buf *bp;
+	struct ufsmount *um;
+	ufs_lbn_t lbn, lbnp;
+	vm_ooffset_t la, lb;
+	long bsize;
+	int bo_bs, count, error, i;
+	bool redo, lpart;
+
+	vp = ap->a_vp;
+	ma = ap->a_m;
+	count = ap->a_count;
+
+	um = VFSTOUFS(ap->a_vp->v_mount);
+	bo_bs = um->um_devvp->v_bufobj.bo_bsize;
+	if (!use_buf_pager && bo_bs <= PAGE_SIZE)
+		return (vnode_pager_generic_getpages(vp, ma, count,
+		    ap->a_rbehind, ap->a_rahead, NULL, NULL));
+
+	object = vp->v_object;
+	la = IDX_TO_OFF(ma[count - 1]->pindex);
+	if (la >= object->un_pager.vnp.vnp_size)
+		return (VM_PAGER_BAD);
+	lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size;
+	if (ap->a_rbehind != NULL) {
+		lb = IDX_TO_OFF(ma[0]->pindex);
+		*ap->a_rbehind = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
+	}
+	if (ap->a_rahead != NULL) {
+		*ap->a_rahead = OFF_TO_IDX(roundup2(la, bo_bs) - la);
+		if (la + IDX_TO_OFF(*ap->a_rahead) >=
+		    object->un_pager.vnp.vnp_size) {
+			*ap->a_rahead = OFF_TO_IDX(roundup2(object->un_pager.
+			    vnp.vnp_size, PAGE_SIZE) - la);
+		}
+	}
+	VM_OBJECT_WLOCK(object);
+again:
+	for (i = 0; i < count; i++)
+		vm_page_busy_downgrade(ma[i]);
+	VM_OBJECT_WUNLOCK(object);
+
+	lbnp = -1;
+	error = 0;
+	for (i = 0; i < count; i++) {
+		m = ma[i];
+
+		/*
+		 * Pages are shared-busy and the object lock is not
+		 * owned, which together allow the pages to be
+		 * invalidated.  The racy validity test avoids
+		 * uselessly creating a buffer for pages already
+		 * validated by a redo pass or by a parallel read,
+		 * when no invalidation races.  The shared->excl
+		 * upgrade loop at the end of the function catches
+		 * the race reliably (under the object lock).
+		 */
+		if (m->valid == VM_PAGE_BITS_ALL)
+			continue;
+
+		lbn = lblkno(um->um_fs, IDX_TO_OFF(m->pindex));
+		if (lbn != lbnp) {
+			bsize = blksize(um->um_fs, VTOI(vp), lbn);
+			error = bread_gb(vp, lbn, bsize, NOCRED, GB_UNMAPPED,
+			    &bp);
+			if (error != 0)
+				break;
+			KASSERT(1 /* racy, enable for debugging */ ||
+			    m->valid == VM_PAGE_BITS_ALL || i == count - 1,
+			    ("buf %d %p invalid", i, m));
+			if (i == count - 1 && lpart) {
+				VM_OBJECT_WLOCK(object);
+				if (m->valid != 0 &&
+				    m->valid != VM_PAGE_BITS_ALL)
+					vm_page_zero_invalid(m, TRUE);
+				VM_OBJECT_WUNLOCK(object);
+			}
+			if (LIST_EMPTY(&bp->b_dep)) {
+				/*
+				 * Invalidation clears m->valid, but
+				 * may leave the B_CACHE flag set if
+				 * the buffer existed at invalidation
+				 * time.  In that case, recycle the
+				 * buffer to force a real read on the
+				 * next bread() after redo.
+				 *
+				 * Otherwise B_RELBUF is not strictly
+				 * necessary; enable it to reduce
+				 * buffer cache pressure.
+				 */
+				if (buf_pager_relbuf ||
+				    m->valid != VM_PAGE_BITS_ALL)
+					bp->b_flags |= B_RELBUF;
+
+				bp->b_flags &= ~B_NOCACHE;
+				brelse(bp);
+			} else {
+				bqrelse(bp);
+			}
+			lbnp = lbn;
+		}
+	}
+
+	VM_OBJECT_WLOCK(object);
+	redo = false;
+	for (i = 0; i < count; i++) {
+		vm_page_sunbusy(ma[i]);
+		ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL);
+
+		/*
+		 * Since the pages were only shared-busy while neither
+		 * the buffer nor the object lock was held by us, or
+		 * were reallocated while vm_page_grab() slept waiting
+		 * for the busy state to be relinquished, they could
+		 * have been invalidated.  Recheck the valid bits and
+		 * re-read as needed.
+		 *
+		 * Note that the last page is made fully valid in the
+		 * read loop, so partial validity for the page at
+		 * index count - 1 means that the page was invalidated
+		 * or removed, and we must restart for safety as well.
+		 */
+		if (ma[i]->valid != VM_PAGE_BITS_ALL)
+			redo = true;
+	}
+	if (redo && error == 0)
+		goto again;
+	VM_OBJECT_WUNLOCK(object);
+	return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
+}
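
To get a concrete feel for the read-behind/read-ahead clamping in
ffs_getpages() above, here is a userland toy with the kernel macros
re-defined locally; the 64K block size and the page indexes are
made-up values for a volume whose device block size exceeds the 4K
page size:

	#include <stdio.h>

	/* Stand-ins for the sys/param.h and vm/vm_param.h macros. */
	#define	PAGE_SIZE	4096LL
	#define	IDX_TO_OFF(i)	((long long)(i) * PAGE_SIZE)
	#define	OFF_TO_IDX(o)	((int)((o) / PAGE_SIZE))
	#define	rounddown2(x, y) ((x) & ~((y) - 1))
	#define	roundup2(x, y)	(((x) + ((y) - 1)) & ~((y) - 1))

	int
	main(void)
	{
		long long bo_bs = 65536;	/* device block size */
		long long lb = IDX_TO_OFF(37);	/* offset of ma[0] */
		long long la = IDX_TO_OFF(42);	/* offset of ma[count - 1] */

		/* Pages from the enclosing block boundary up to ma[0]. */
		printf("rbehind = %d pages\n",
		    OFF_TO_IDX(lb - rounddown2(lb, bo_bs)));
		/* Pages from ma[count - 1] up to the next boundary. */
		printf("rahead  = %d pages\n",
		    OFF_TO_IDX(roundup2(la, bo_bs) - la));
		return (0);
	}

This prints "rbehind = 5 pages" and "rahead = 6 pages", aligning the
extended run with the boundaries of the enclosing 64K (16-page)
block, so that a single bread() can validate every resident page the
block covers.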


