Date: Wed, 30 Sep 2015 23:06:30 +0000 (UTC)
From: Mark Johnston <markj@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r288431 - in head/sys: kern sys vm
Message-ID: <201509302306.t8UN6UwX043736@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: markj
Date: Wed Sep 30 23:06:29 2015
New Revision: 288431
URL: https://svnweb.freebsd.org/changeset/base/288431

Log:
  As a step towards the elimination of PG_CACHED pages, rework the handling of POSIX_FADV_DONTNEED so that it causes the backing pages to be moved to the head of the inactive queue instead of being cached.

  This affects the implementation of POSIX_FADV_NOREUSE as well, since it works by applying POSIX_FADV_DONTNEED to file ranges after they have been read or written. At that point the corresponding buffers may still be dirty, so the previous implementation would coalesce successive ranges and apply POSIX_FADV_DONTNEED to the result, ensuring that pages backing the dirty buffers would eventually be cached. To preserve this behaviour in an efficient manner, this change adds a new buf flag, B_NOREUSE, which causes the pages backing a VMIO buf to be placed at the head of the inactive queue when the buf is released. POSIX_FADV_NOREUSE then works by setting this flag in bufs that underlie the specified range.

Reviewed by: alc, kib Sponsored by: EMC / Isilon Storage Division Differential Revision: https://reviews.freebsd.org/D3726 Modified: head/sys/kern/vfs_bio.c head/sys/kern/vfs_default.c head/sys/kern/vfs_syscalls.c head/sys/kern/vfs_vnops.c head/sys/sys/buf.h head/sys/sys/file.h head/sys/vm/vm_object.c head/sys/vm/vm_object.h head/sys/vm/vm_page.c head/sys/vm/vm_page.h Modified: head/sys/kern/vfs_bio.c ============================================================================== --- head/sys/kern/vfs_bio.c Wed Sep 30 21:32:29 2015 (r288430) +++ head/sys/kern/vfs_bio.c Wed Sep 30 23:06:29 2015 (r288431) @@ -1785,6 +1785,8 @@ brelse(struct buf *bp) bp, bp->b_vp, bp->b_flags); KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); + KASSERT((bp->b_flags & B_VMIO) != 0 || (bp->b_flags & B_NOREUSE) == 0, + ("brelse: non-VMIO buffer marked NOREUSE")); if (BUF_LOCKRECURSED(bp)) { /* @@ -1873,8 +1875,10 @@ brelse(struct buf *bp) allocbuf(bp, 0); } - if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0) { + if ((bp->b_flags & (B_INVAL | B_RELBUF)) != 0 || + (bp->b_flags & (B_DELWRI | B_NOREUSE)) == B_NOREUSE) { allocbuf(bp, 0); + bp->b_flags &= ~B_NOREUSE; if (bp->b_vp != NULL) brelvp(bp); } @@ -1969,6 +1973,10 @@ bqrelse(struct buf *bp) if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY)) panic("bqrelse: not dirty"); + if ((bp->b_flags & B_NOREUSE) != 0) { + brelse(bp); + return; + } qindex = QUEUE_CLEAN; } binsfree(bp, qindex); @@ -2079,10 +2087,15 @@ vfs_vmio_unwire(struct buf *bp, vm_page_ freed = false; if (!freed) { /* - * In order to maintain LRU page ordering, put - * the page at the tail of the inactive queue. + * If the page is unlikely to be reused, let the + * VM know. Otherwise, maintain LRU page + * ordering and put the page at the tail of the + * inactive queue. 
*/ - vm_page_deactivate(m); + if ((bp->b_flags & B_NOREUSE) != 0) + vm_page_deactivate_noreuse(m); + else + vm_page_deactivate(m); } } vm_page_unlock(m); @@ -2456,8 +2469,9 @@ getnewbuf_reuse_bp(struct buf *bp, int q * Note: we no longer distinguish between VMIO and non-VMIO * buffers. */ - KASSERT((bp->b_flags & B_DELWRI) == 0, - ("delwri buffer %p found in queue %d", bp, qindex)); + KASSERT((bp->b_flags & (B_DELWRI | B_NOREUSE)) == 0, + ("invalid buffer %p flags %#x found in queue %d", bp, bp->b_flags, + qindex)); /* * When recycling a clean buffer we have to truncate it and Modified: head/sys/kern/vfs_default.c ============================================================================== --- head/sys/kern/vfs_default.c Wed Sep 30 21:32:29 2015 (r288430) +++ head/sys/kern/vfs_default.c Wed Sep 30 23:06:29 2015 (r288431) @@ -1034,9 +1034,12 @@ vop_stdallocate(struct vop_allocate_args int vop_stdadvise(struct vop_advise_args *ap) { + struct buf *bp; + struct buflists *bl; struct vnode *vp; + daddr_t bn, startn, endn; off_t start, end; - int error; + int bsize, error; vp = ap->a_vp; switch (ap->a_advice) { @@ -1049,28 +1052,59 @@ vop_stdadvise(struct vop_advise_args *ap error = 0; break; case POSIX_FADV_DONTNEED: - /* - * Flush any open FS buffers and then remove pages - * from the backing VM object. Using vinvalbuf() here - * is a bit heavy-handed as it flushes all buffers for - * the given vnode, not just the buffers covering the - * requested range. - */ error = 0; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); if (vp->v_iflag & VI_DOOMED) { VOP_UNLOCK(vp, 0); break; } - vinvalbuf(vp, V_CLEANONLY, 0, 0); + + /* + * Deactivate pages in the specified range from the backing VM + * object. Pages that are resident in the buffer cache will + * remain wired until their corresponding buffers are released + * below. 
+ */ if (vp->v_object != NULL) { start = trunc_page(ap->a_start); end = round_page(ap->a_end); VM_OBJECT_WLOCK(vp->v_object); - vm_object_page_cache(vp->v_object, OFF_TO_IDX(start), + vm_object_page_noreuse(vp->v_object, OFF_TO_IDX(start), OFF_TO_IDX(end)); VM_OBJECT_WUNLOCK(vp->v_object); } + + BO_RLOCK(&vp->v_bufobj); + bsize = vp->v_bufobj.bo_bsize; + startn = ap->a_start / bsize; + if (ap->a_end == OFF_MAX) { + endn = -1; + bl = &vp->v_bufobj.bo_clean.bv_hd; + if (!TAILQ_EMPTY(bl)) + endn = TAILQ_LAST(bl, buflists)->b_lblkno; + bl = &vp->v_bufobj.bo_dirty.bv_hd; + if (!TAILQ_EMPTY(bl) && + endn < TAILQ_LAST(bl, buflists)->b_lblkno) + endn = TAILQ_LAST(bl, buflists)->b_lblkno; + } else + endn = ap->a_end / bsize; + BO_RUNLOCK(&vp->v_bufobj); + /* + * In the VMIO case, use the B_NOREUSE flag to hint that the + * pages backing each buffer in the range are unlikely to be + * reused. Dirty buffers will have the hint applied once + * they've been written. + */ + for (bn = startn; bn <= endn; bn++) { + bp = getblk(vp, bn, bsize, 0, 0, GB_NOCREAT | + GB_UNMAPPED); + if (bp == NULL) + continue; + bp->b_flags |= B_RELBUF; + if (vp->v_object != NULL) + bp->b_flags |= B_NOREUSE; + brelse(bp); + } VOP_UNLOCK(vp, 0); break; default: Modified: head/sys/kern/vfs_syscalls.c ============================================================================== --- head/sys/kern/vfs_syscalls.c Wed Sep 30 21:32:29 2015 (r288430) +++ head/sys/kern/vfs_syscalls.c Wed Sep 30 23:06:29 2015 (r288431) @@ -4610,8 +4610,6 @@ kern_posix_fadvise(struct thread *td, in new->fa_advice = advice; new->fa_start = offset; new->fa_end = end; - new->fa_prevstart = 0; - new->fa_prevend = 0; fp->f_advice = new; new = fa; } Modified: head/sys/kern/vfs_vnops.c ============================================================================== --- head/sys/kern/vfs_vnops.c Wed Sep 30 21:32:29 2015 (r288430) +++ head/sys/kern/vfs_vnops.c Wed Sep 30 23:06:29 2015 (r288431) @@ -770,10 +770,9 @@ vn_read(fp, uio, 
active_cred, flags, td) struct thread *td; { struct vnode *vp; - struct mtx *mtxp; + off_t orig_offset; int error, ioflag; int advice; - off_t offset, start, end; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); @@ -797,7 +796,7 @@ vn_read(fp, uio, active_cred, flags, td) /* Disable read-ahead for random I/O. */ break; } - offset = uio->uio_offset; + orig_offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_read(active_cred, fp->f_cred, vp); @@ -807,39 +806,14 @@ vn_read(fp, uio, active_cred, flags, td) fp->f_nextoff = uio->uio_offset; VOP_UNLOCK(vp, 0); if (error == 0 && advice == POSIX_FADV_NOREUSE && - offset != uio->uio_offset) { + orig_offset != uio->uio_offset) /* - * Use POSIX_FADV_DONTNEED to flush clean pages and - * buffers for the backing file after a - * POSIX_FADV_NOREUSE read(2). To optimize the common - * case of using POSIX_FADV_NOREUSE with sequential - * access, track the previous implicit DONTNEED - * request and grow this request to include the - * current read(2) in addition to the previous - * DONTNEED. With purely sequential access this will - * cause the DONTNEED requests to continously grow to - * cover all of the previously read regions of the - * file. This allows filesystem blocks that are - * accessed by multiple calls to read(2) to be flushed - * once the last read(2) finishes. + * Use POSIX_FADV_DONTNEED to flush pages and buffers + * for the backing file after a POSIX_FADV_NOREUSE + * read(2). 
*/ - start = offset; - end = uio->uio_offset - 1; - mtxp = mtx_pool_find(mtxpool_sleep, fp); - mtx_lock(mtxp); - if (fp->f_advice != NULL && - fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) { - if (start != 0 && fp->f_advice->fa_prevend + 1 == start) - start = fp->f_advice->fa_prevstart; - else if (fp->f_advice->fa_prevstart != 0 && - fp->f_advice->fa_prevstart == end + 1) - end = fp->f_advice->fa_prevend; - fp->f_advice->fa_prevstart = start; - fp->f_advice->fa_prevend = end; - } - mtx_unlock(mtxp); - error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED); - } + error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, + POSIX_FADV_DONTNEED); return (error); } @@ -856,10 +830,9 @@ vn_write(fp, uio, active_cred, flags, td { struct vnode *vp; struct mount *mp; - struct mtx *mtxp; + off_t orig_offset; int error, ioflag, lock_flags; int advice; - off_t offset, start, end; KASSERT(uio->uio_td == td, ("uio_td %p is not td %p", uio->uio_td, td)); @@ -902,7 +875,7 @@ vn_write(fp, uio, active_cred, flags, td /* XXX: Is this correct? */ break; } - offset = uio->uio_offset; + orig_offset = uio->uio_offset; #ifdef MAC error = mac_vnode_check_write(active_cred, fp->f_cred, vp); @@ -914,55 +887,14 @@ vn_write(fp, uio, active_cred, flags, td if (vp->v_type != VCHR) vn_finished_write(mp); if (error == 0 && advice == POSIX_FADV_NOREUSE && - offset != uio->uio_offset) { + orig_offset != uio->uio_offset) /* - * Use POSIX_FADV_DONTNEED to flush clean pages and - * buffers for the backing file after a - * POSIX_FADV_NOREUSE write(2). To optimize the - * common case of using POSIX_FADV_NOREUSE with - * sequential access, track the previous implicit - * DONTNEED request and grow this request to include - * the current write(2) in addition to the previous - * DONTNEED. With purely sequential access this will - * cause the DONTNEED requests to continously grow to - * cover all of the previously written regions of the - * file. 
- * - * Note that the blocks just written are almost - * certainly still dirty, so this only works when - * VOP_ADVISE() calls from subsequent writes push out - * the data written by this write(2) once the backing - * buffers are clean. However, as compared to forcing - * IO_DIRECT, this gives much saner behavior. Write - * clustering is still allowed, and clean pages are - * merely moved to the cache page queue rather than - * outright thrown away. This means a subsequent - * read(2) can still avoid hitting the disk if the - * pages have not been reclaimed. - * - * This does make POSIX_FADV_NOREUSE largely useless - * with non-sequential access. However, sequential - * access is the more common use case and the flag is - * merely advisory. + * Use POSIX_FADV_DONTNEED to flush pages and buffers + * for the backing file after a POSIX_FADV_NOREUSE + * write(2). */ - start = offset; - end = uio->uio_offset - 1; - mtxp = mtx_pool_find(mtxpool_sleep, fp); - mtx_lock(mtxp); - if (fp->f_advice != NULL && - fp->f_advice->fa_advice == POSIX_FADV_NOREUSE) { - if (start != 0 && fp->f_advice->fa_prevend + 1 == start) - start = fp->f_advice->fa_prevstart; - else if (fp->f_advice->fa_prevstart != 0 && - fp->f_advice->fa_prevstart == end + 1) - end = fp->f_advice->fa_prevend; - fp->f_advice->fa_prevstart = start; - fp->f_advice->fa_prevend = end; - } - mtx_unlock(mtxp); - error = VOP_ADVISE(vp, start, end, POSIX_FADV_DONTNEED); - } - + error = VOP_ADVISE(vp, orig_offset, uio->uio_offset - 1, + POSIX_FADV_DONTNEED); unlock: return (error); } Modified: head/sys/sys/buf.h ============================================================================== --- head/sys/sys/buf.h Wed Sep 30 21:32:29 2015 (r288430) +++ head/sys/sys/buf.h Wed Sep 30 23:06:29 2015 (r288431) @@ -204,7 +204,7 @@ struct buf { #define B_PERSISTENT 0x00000100 /* Perm. ref'ed while EXT2FS mounted. */ #define B_DONE 0x00000200 /* I/O completed. 
*/ #define B_EINTR 0x00000400 /* I/O was interrupted */ -#define B_00000800 0x00000800 /* Available flag. */ +#define B_NOREUSE 0x00000800 /* Contents not reused once released. */ #define B_00001000 0x00001000 /* Available flag. */ #define B_INVAL 0x00002000 /* Does not contain valid info. */ #define B_BARRIER 0x00004000 /* Write this and all preceeding first. */ @@ -229,7 +229,7 @@ struct buf { #define PRINT_BUF_FLAGS "\20\40remfree\37cluster\36vmio\35ram\34managed" \ "\33paging\32infreecnt\31nocopy\30b23\27relbuf\26dirty\25b20" \ "\24b19\23b18\22clusterok\21malloc\20nocache\17b14\16inval" \ - "\15b12\14b11\13eintr\12done\11persist\10delwri" \ + "\15b12\14noreuse\13eintr\12done\11persist\10delwri" \ "\7validsuspwrt\6cache\5deferred\4direct\3async\2needcommit\1age" /* Modified: head/sys/sys/file.h ============================================================================== --- head/sys/sys/file.h Wed Sep 30 21:32:29 2015 (r288430) +++ head/sys/sys/file.h Wed Sep 30 23:06:29 2015 (r288431) @@ -160,8 +160,6 @@ struct fadvise_info { int fa_advice; /* (f) FADV_* type. */ off_t fa_start; /* (f) Region start. */ off_t fa_end; /* (f) Region end. */ - off_t fa_prevstart; /* (f) Previous NOREUSE start. */ - off_t fa_prevend; /* (f) Previous NOREUSE end. */ }; struct file { Modified: head/sys/vm/vm_object.c ============================================================================== --- head/sys/vm/vm_object.c Wed Sep 30 21:32:29 2015 (r288430) +++ head/sys/vm/vm_object.c Wed Sep 30 23:06:29 2015 (r288431) @@ -1963,15 +1963,15 @@ skipmemq: } /* - * vm_object_page_cache: + * vm_object_page_noreuse: * - * For the given object, attempt to move the specified clean - * pages to the cache queue. If a page is wired for any reason, - * then it will not be changed. Pages are specified by the given - * range ["start", "end"). As a special case, if "end" is zero, - * then the range extends from "start" to the end of the object. 
- * Any mappings to the specified pages are removed before the - * pages are moved to the cache queue. + * For the given object, attempt to move the specified pages to + * the head of the inactive queue. This bypasses regular LRU + * operation and allows the pages to be reused quickly under memory + * pressure. If a page is wired for any reason, then it will not + * be queued. Pages are specified by the range ["start", "end"). + * As a special case, if "end" is zero, then the range extends from + * "start" to the end of the object. * * This operation should only be performed on objects that * contain non-fictitious, managed pages. @@ -1979,14 +1979,14 @@ skipmemq: * The object must be locked. */ void -vm_object_page_cache(vm_object_t object, vm_pindex_t start, vm_pindex_t end) +vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, vm_pindex_t end) { struct mtx *mtx, *new_mtx; vm_page_t p, next; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & (OBJ_FICTITIOUS | OBJ_UNMANAGED)) == 0, - ("vm_object_page_cache: illegal object %p", object)); + ("vm_object_page_noreuse: illegal object %p", object)); if (object->resident_page_count == 0) return; p = vm_page_find_least(object, start); @@ -2009,7 +2009,7 @@ vm_object_page_cache(vm_object_t object, mtx = new_mtx; mtx_lock(mtx); } - vm_page_try_to_cache(p); + vm_page_deactivate_noreuse(p); } if (mtx != NULL) mtx_unlock(mtx); Modified: head/sys/vm/vm_object.h ============================================================================== --- head/sys/vm/vm_object.h Wed Sep 30 21:32:29 2015 (r288430) +++ head/sys/vm/vm_object.h Wed Sep 30 23:06:29 2015 (r288431) @@ -304,10 +304,10 @@ void vm_object_terminate (vm_object_t); void vm_object_set_writeable_dirty (vm_object_t); void vm_object_init (void); void vm_object_madvise(vm_object_t, vm_pindex_t, vm_pindex_t, int); -void vm_object_page_cache(vm_object_t object, vm_pindex_t start, - vm_pindex_t end); boolean_t vm_object_page_clean(vm_object_t object, 
vm_ooffset_t start, vm_ooffset_t end, int flags); +void vm_object_page_noreuse(vm_object_t object, vm_pindex_t start, + vm_pindex_t end); void vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options); boolean_t vm_object_populate(vm_object_t, vm_pindex_t, vm_pindex_t); Modified: head/sys/vm/vm_page.c ============================================================================== --- head/sys/vm/vm_page.c Wed Sep 30 21:32:29 2015 (r288430) +++ head/sys/vm/vm_page.c Wed Sep 30 23:06:29 2015 (r288431) @@ -2589,6 +2589,19 @@ vm_page_deactivate(vm_page_t m) } /* + * Move the specified page to the inactive queue with the expectation + * that it is unlikely to be reused. + * + * The page must be locked. + */ +void +vm_page_deactivate_noreuse(vm_page_t m) +{ + + _vm_page_deactivate(m, 1); +} + +/* * vm_page_try_to_cache: * * Returns 0 on failure, 1 on success @@ -2740,8 +2753,7 @@ vm_page_cache(vm_page_t m) /* * vm_page_advise * - * Deactivate or do nothing, as appropriate. This routine is used - * by madvise() and vop_stdadvise(). + * Deactivate or do nothing, as appropriate. * * The object and page must be locked. */ Modified: head/sys/vm/vm_page.h ============================================================================== --- head/sys/vm/vm_page.h Wed Sep 30 21:32:29 2015 (r288430) +++ head/sys/vm/vm_page.h Wed Sep 30 23:06:29 2015 (r288431) @@ -451,6 +451,7 @@ void vm_page_cache_transfer(vm_object_t, int vm_page_try_to_cache (vm_page_t); int vm_page_try_to_free (vm_page_t); void vm_page_deactivate (vm_page_t); +void vm_page_deactivate_noreuse(vm_page_t); void vm_page_dequeue(vm_page_t m); void vm_page_dequeue_locked(vm_page_t m); vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201509302306.t8UN6UwX043736>