Date: Sat, 12 Nov 2016 15:19:13 -0800
From: Adrian Chadd <adrian.chadd@gmail.com>
To: Konstantin Belousov <kib@freebsd.org>, "freebsd-mips@freebsd.org" <freebsd-mips@freebsd.org>
Cc: "src-committers@freebsd.org" <src-committers@freebsd.org>, "svn-src-all@freebsd.org" <svn-src-all@freebsd.org>, "svn-src-head@freebsd.org" <svn-src-head@freebsd.org>
Subject: Re: svn commit: r307626 - head/sys/ufs/ffs
Message-ID: <CAJ-Vmom5rYe89m7bch4qoHHq3X2e67pPk_7G2aRGrjSPNp5mzg@mail.gmail.com>
In-Reply-To: <201610191109.u9JB9TTC002727@repo.freebsd.org>
References: <201610191109.u9JB9TTC002727@repo.freebsd.org>
hi!

This broke FreeBSD on mips24k:

BAD_PAGE_FAULT: pid 1 tid 100001 (init), uid 0: pc 0x4002a4 got a read fault (type 0x2) at 0

Trapframe Register Dump:
        zero: 0  at: 0  v0: 0  v1: 0
        a0: 0x7fffeecc  a1: 0  a2: 0  a3: 0
        t0: 0  t1: 0  t2: 0  t3: 0
        t4: 0  t5: 0  t6: 0  t7: 0
        t8: 0  t9: 0x400260  s0: 0x10  s1: 0x2
        s2: 0x7fffeed0  s3: 0  s4: 0  s5: 0
        s6: 0  s7: 0  k0: 0  k1: 0
        gp: 0x4d55d0  sp: 0x7ffeee90  s8: 0  ra: 0
        sr: 0xfc13  mullo: 0  mulhi: 0  badvaddr: 0
        cause: 0x8  pc: 0x4002a4

Page table info for pc address 0x4002a4: pde = 0x809be000, pte = 0xa001acda
Dumping 4 words starting at pc address 0x4002a4:
        8c420000 14400003 00908021 8f828024
Page table info for bad address 0: pde = 0, pte = 0
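For what it's worth, hand-decoding those four words as standard MIPS32
instructions (a sketch done by eye, not disassembler output, so double-check
before trusting it):

        8c420000    lw    v0, 0(v0)        <- faulting instruction
        14400003    bnez  v0, .+16
        00908021    addu  s0, a0, s0
        8f828024    lw    v0, -32732(gp)

With v0 == 0 and badvaddr == 0 in the dump above, that first lw looks like a
plain NULL-pointer read in init's userland startup path.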
... and yes, I've spent three days bisecting everything to get to this
particular commit.

-adrian

On 19 October 2016 at 04:09, Konstantin Belousov <kib@freebsd.org> wrote:
> Author: kib
> Date: Wed Oct 19 11:09:29 2016
> New Revision: 307626
> URL: https://svnweb.freebsd.org/changeset/base/307626
>
> Log:
>   Add FFS pager, which uses buffer cache read operation to validate pages.
>   See the comments for more detailed description of the algorithm.
>
>   The pager is used unconditionally when the block size of the
>   underlying device is larger than the machine page size, since local
>   vnode pager cannot handle the configuration [1].  Otherwise, the
>   vfs.ffs.use_buf_pager sysctl allows to switch to the local pager.
>
>   Measurements demonstrated no regression in the ever-important
>   buildworld benchmark, and small (~5%) throughput improvements in the
>   special microbenchmark configuration for dbench over swap-backed
>   md(4).
>
>   Code can be generalized and reused for other filesystems which use
>   buffer cache.
>
>   Reported by:  Anton Yuzhaninov <citrin@citrin.ru> [1]
>   Tested by:    pho
>   Benchmarked by:  mjg, pho
>   Reviewed by:  alc, markj, mckusick (previous version)
>   Sponsored by: The FreeBSD Foundation
>   MFC after:    2 weeks
>   Differential revision:  https://reviews.freebsd.org/D8198
>
> Modified:
>   head/sys/ufs/ffs/ffs_vnops.c
>
> Modified: head/sys/ufs/ffs/ffs_vnops.c
> ==============================================================================
> --- head/sys/ufs/ffs/ffs_vnops.c        Wed Oct 19 10:01:04 2016        (r307625)
> +++ head/sys/ufs/ffs/ffs_vnops.c        Wed Oct 19 11:09:29 2016        (r307626)
> @@ -77,6 +77,7 @@ __FBSDID("$FreeBSD$");
>  #include <sys/priv.h>
>  #include <sys/rwlock.h>
>  #include <sys/stat.h>
> +#include <sys/sysctl.h>
>  #include <sys/vmmeter.h>
>  #include <sys/vnode.h>
>
> @@ -86,6 +87,7 @@ __FBSDID("$FreeBSD$");
>  #include <vm/vm_object.h>
>  #include <vm/vm_page.h>
>  #include <vm/vm_pager.h>
> +#include <vm/vm_pageout.h>
>  #include <vm/vnode_pager.h>
>
>  #include <ufs/ufs/extattr.h>
> @@ -102,8 +104,9 @@ __FBSDID("$FreeBSD$");
>  #ifdef DIRECTIO
>  extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
>  #endif
> -static vop_fsync_t      ffs_fsync;
>  static vop_fdatasync_t  ffs_fdatasync;
> +static vop_fsync_t      ffs_fsync;
> +static vop_getpages_t   ffs_getpages;
>  static vop_lock1_t      ffs_lock;
>  static vop_read_t       ffs_read;
>  static vop_write_t      ffs_write;
> @@ -119,13 +122,12 @@ static vop_openextattr_t        ffs_openextattr
>  static vop_setextattr_t ffs_setextattr;
>  static vop_vptofh_t     ffs_vptofh;
>
> -
>  /* Global vfs data structures for ufs. */
>  struct vop_vector ffs_vnodeops1 = {
>         .vop_default =          &ufs_vnodeops,
>         .vop_fsync =            ffs_fsync,
>         .vop_fdatasync =        ffs_fdatasync,
> -       .vop_getpages =         vnode_pager_local_getpages,
> +       .vop_getpages =         ffs_getpages,
>         .vop_getpages_async =   vnode_pager_local_getpages_async,
>         .vop_lock1 =            ffs_lock,
>         .vop_read =             ffs_read,
> @@ -147,7 +149,7 @@ struct vop_vector ffs_vnodeops2 = {
>         .vop_default =          &ufs_vnodeops,
>         .vop_fsync =            ffs_fsync,
>         .vop_fdatasync =        ffs_fdatasync,
> -       .vop_getpages =         vnode_pager_local_getpages,
> +       .vop_getpages =         ffs_getpages,
>         .vop_getpages_async =   vnode_pager_local_getpages_async,
>         .vop_lock1 =            ffs_lock,
>         .vop_read =             ffs_read,
> @@ -1784,3 +1786,165 @@ vop_vptofh {
>         ufhp->ufid_gen = ip->i_gen;
>         return (0);
>  }
> +
> +SYSCTL_DECL(_vfs_ffs);
> +static int use_buf_pager = 1;
> +SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
> +    "Always use buffer pager instead of bmap");
> +static int buf_pager_relbuf;
> +SYSCTL_INT(_vfs_ffs, OID_AUTO, buf_pager_relbuf, CTLFLAG_RWTUN,
> +    &buf_pager_relbuf, 0,
> +    "Make buffer pager release buffers after reading");
> +
> +/*
> + * The FFS pager.  It uses buffer reads to validate pages.
> + *
> + * In contrast to the generic local pager from vm/vnode_pager.c, this
> + * pager correctly and easily handles volumes where the underlying
> + * device block size is greater than the machine page size.  The
> + * buffer cache transparently extends the requested page run to be
> + * aligned at the block boundary, and does the necessary bogus page
> + * replacements in the addends to avoid obliterating already valid
> + * pages.
> + *
> + * The only non-trivial issue is that the exclusive busy state for
> + * pages, which is assumed by the vm_pager_getpages() interface, is
> + * incompatible with the VMIO buffer cache's desire to share-busy the
> + * pages.  This function performs a trivial downgrade of the pages'
> + * state before reading buffers, and a less trivial upgrade from the
> + * shared-busy to excl-busy state after the read.
> + */
> +static int
> +ffs_getpages(struct vop_getpages_args *ap)
> +{
> +       struct vnode *vp;
> +       vm_page_t *ma, m;
> +       vm_object_t object;
> +       struct buf *bp;
> +       struct ufsmount *um;
> +       ufs_lbn_t lbn, lbnp;
> +       vm_ooffset_t la, lb;
> +       long bsize;
> +       int bo_bs, count, error, i;
> +       bool redo, lpart;
> +
> +       vp = ap->a_vp;
> +       ma = ap->a_m;
> +       count = ap->a_count;
> +
> +       um = VFSTOUFS(ap->a_vp->v_mount);
> +       bo_bs = um->um_devvp->v_bufobj.bo_bsize;
> +       if (!use_buf_pager && bo_bs <= PAGE_SIZE)
> +               return (vnode_pager_generic_getpages(vp, ma, count,
> +                   ap->a_rbehind, ap->a_rahead, NULL, NULL));
> +
> +       object = vp->v_object;
> +       la = IDX_TO_OFF(ma[count - 1]->pindex);
> +       if (la >= object->un_pager.vnp.vnp_size)
> +               return (VM_PAGER_BAD);
> +       lpart = la + PAGE_SIZE > object->un_pager.vnp.vnp_size;
> +       if (ap->a_rbehind != NULL) {
> +               lb = IDX_TO_OFF(ma[0]->pindex);
> +               *ap->a_rbehind = OFF_TO_IDX(lb - rounddown2(lb, bo_bs));
> +       }
> +       if (ap->a_rahead != NULL) {
> +               *ap->a_rahead = OFF_TO_IDX(roundup2(la, bo_bs) - la);
> +               if (la + IDX_TO_OFF(*ap->a_rahead) >=
> +                   object->un_pager.vnp.vnp_size) {
> +                       *ap->a_rahead = OFF_TO_IDX(roundup2(object->un_pager.
> +                           vnp.vnp_size, PAGE_SIZE) - la);
> +               }
> +       }
> +       VM_OBJECT_WLOCK(object);
> +again:
> +       for (i = 0; i < count; i++)
> +               vm_page_busy_downgrade(ma[i]);
> +       VM_OBJECT_WUNLOCK(object);
> +
> +       lbnp = -1;
> +       for (i = 0; i < count; i++) {
> +               m = ma[i];
> +
> +               /*
> +                * Pages are shared busy and the object lock is not
> +                * owned, which together allow for the pages'
> +                * invalidation.  The racy test for validity avoids
> +                * useless creation of the buffer for the most typical
> +                * case when invalidation is not used in redo or for
> +                * parallel read.  The shared->excl upgrade loop at
> +                * the end of the function catches the race in a
> +                * reliable way (protected by the object lock).
> +                */
> +               if (m->valid == VM_PAGE_BITS_ALL)
> +                       continue;
> +
> +               lbn = lblkno(um->um_fs, IDX_TO_OFF(m->pindex));
> +               if (lbn != lbnp) {
> +                       bsize = blksize(um->um_fs, VTOI(vp), lbn);
> +                       error = bread_gb(vp, lbn, bsize, NOCRED, GB_UNMAPPED,
> +                           &bp);
> +                       if (error != 0)
> +                               break;
> +                       KASSERT(1 /* racy, enable for debugging */ ||
> +                           m->valid == VM_PAGE_BITS_ALL || i == count - 1,
> +                           ("buf %d %p invalid", i, m));
> +                       if (i == count - 1 && lpart) {
> +                               VM_OBJECT_WLOCK(object);
> +                               if (m->valid != 0 &&
> +                                   m->valid != VM_PAGE_BITS_ALL)
> +                                       vm_page_zero_invalid(m, TRUE);
> +                               VM_OBJECT_WUNLOCK(object);
> +                       }
> +                       if (LIST_EMPTY(&bp->b_dep)) {
> +                               /*
> +                                * Invalidation clears m->valid, but
> +                                * may leave B_CACHE flag if the
> +                                * buffer existed at the invalidation
> +                                * time.  In this case, recycle the
> +                                * buffer to do real read on next
> +                                * bread() after redo.
> +                                *
> +                                * Otherwise B_RELBUF is not strictly
> +                                * necessary, enable to reduce buf
> +                                * cache pressure.
> +                                */
> +                               if (buf_pager_relbuf ||
> +                                   m->valid != VM_PAGE_BITS_ALL)
> +                                       bp->b_flags |= B_RELBUF;
> +
> +                               bp->b_flags &= ~B_NOCACHE;
> +                               brelse(bp);
> +                       } else {
> +                               bqrelse(bp);
> +                       }
> +                       lbnp = lbn;
> +               }
> +       }
> +
> +       VM_OBJECT_WLOCK(object);
> +       redo = false;
> +       for (i = 0; i < count; i++) {
> +               vm_page_sunbusy(ma[i]);
> +               ma[i] = vm_page_grab(object, ma[i]->pindex, VM_ALLOC_NORMAL);
> +
> +               /*
> +                * Since the pages were only sbusy while neither the
> +                * buffer nor the object lock was held by us, or
> +                * reallocated while vm_page_grab() slept for busy
> +                * relinguish, they could have been invalidated.
> +                * Recheck the valid bits and re-read as needed.
> +                *
> +                * Note that the last page is made fully valid in the
> +                * read loop, and partial validity for the page at
> +                * index count - 1 could mean that the page was
> +                * invalidated or removed, so we must restart for
> +                * safety as well.
> +                */
> +               if (ma[i]->valid != VM_PAGE_BITS_ALL)
> +                       redo = true;
> +       }
> +       if (redo && error == 0)
> +               goto again;
> +       VM_OBJECT_WUNLOCK(object);
> +       return (error != 0 ? VM_PAGER_ERROR : VM_PAGER_OK);
> +}
>
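One more note for anyone else chasing this: judging by the diff above, the
new pager is opt-out. vfs.ffs.use_buf_pager is CTLFLAG_RWTUN, so (untested
on my side, treat as a sketch) something like this should switch back to the
generic path, at least where the device block size is <= PAGE_SIZE:

        # loader.conf tunable or runtime sysctl; untested
        sysctl vfs.ffs.use_buf_pager=0

Note the fallback in ffs_getpages() is vnode_pager_generic_getpages(), not
the vnode_pager_local_getpages() that was wired up before this commit, so
flipping the sysctl is not an exact revert.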