From owner-freebsd-hackers Thu Jul 29 11:56:59 1999
Date: Thu, 29 Jul 1999 11:56:07 -0700 (PDT)
From: Matthew Dillon <dillon@apollo.backplane.com>
To: Alan Cox, David Greenman
Cc: hackers@FreeBSD.ORG
Subject: patch for behavior changes and madvise MADV_DONTNEED
Message-Id: <199907291856.LAA77471@apollo.backplane.com>
References: <199907162234.PAA21850@apollo.backplane.com>
	<19990720014804.A21777@cs.rice.edu>

    I have tested this on both small and large files and it appears to
    work extremely well.  So well, in fact, that I can have a program
    which mmap()'s a large file and continuously scans it - generating
    4MB/sec of network traffic - with virtually no effect on the rest
    of the system.

    I am not finished testing.  I will be running buildworlds and other
    related tests overnight to ensure that no previously fixed bugs have
    been reintroduced.  This patch will probably become the commit
    candidate tomorrow.

    There are several things in this patch:

	* minor readability fix in pmap.c

	* vm_page_undirty()

	* madvise() has, in general, been extended to operate on files
	  (except for MADV_FREE)

	* madvise(... MADV_DONTNEED) has been implemented to avoid
	  starving the VM page queues while at the same time enforcing
	  a slow balancing to deal with both the small-file and the
	  large-file case.

	  If we wanted to we could further optimize this code by
	  modifying vm_page_dontneed().  For example, we could have it
	  ignore pages whose act_count is too large.

	* #if 0'ing out apparently unnecessary ufs code (if it winds up
	  being necessary I recommend removing it anyway and making the
	  required changes to vm_fault.c instead)
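    For reference, the scanning test looks roughly like the sketch
    below.  This is not the exact program I used - the 1MB chunk size,
    the hardwired 4096-byte page size (i386), and the error handling
    are all arbitrary - but it exercises the new MADV_DONTNEED path
    the same way:

	#include <sys/types.h>
	#include <sys/mman.h>
	#include <sys/stat.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int
	main(int argc, char **argv)
	{
		struct stat st;
		char *base;
		off_t off;
		size_t chunk, i;
		volatile char c;
		int fd;

		if (argc != 2) {
			fprintf(stderr, "usage: mscan file\n");
			exit(1);
		}
		if ((fd = open(argv[1], O_RDONLY)) < 0 || fstat(fd, &st) < 0) {
			perror(argv[1]);
			exit(1);
		}
		base = mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_SHARED, fd, 0);
		if (base == MAP_FAILED) {
			perror("mmap");
			exit(1);
		}

		for (off = 0; off < st.st_size; off += chunk) {
			chunk = 1024 * 1024;
			if ((off_t)chunk > st.st_size - off)
				chunk = (size_t)(st.st_size - off);

			/* touch every page in the chunk */
			for (i = 0; i < chunk; i += 4096)
				c = base[off + i];

			/* tell the VM system we are done with these pages */
			madvise(base + off, chunk, MADV_DONTNEED);
		}
		munmap(base, (size_t)st.st_size);
		close(fd);
		return (0);
	}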
						-Matt
						Matthew Dillon

Index: i386/i386/pmap.c
===================================================================
RCS file: /home/ncvs/src/sys/i386/i386/pmap.c,v
retrieving revision 1.242
diff -u -r1.242 pmap.c
--- pmap.c	1999/07/21 18:01:40	1.242
+++ pmap.c	1999/07/21 20:43:00
@@ -3188,7 +3188,7 @@
 
 			pte = pmap_pte_quick(pv->pv_pmap, pv->pv_va);
 
-			if (pte && *pte & PG_A) {
+			if (pte && (*pte & PG_A)) {
 				*pte &= ~PG_A;
 
 				pmap_TLB_invalidate(pv->pv_pmap, pv->pv_va);
Index: miscfs/devfs/devfs_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/miscfs/devfs/devfs_vnops.c,v
retrieving revision 1.75
diff -u -r1.75 devfs_vnops.c
--- devfs_vnops.c	1999/06/26 02:46:17	1.75
+++ devfs_vnops.c	1999/07/08 22:20:29
@@ -2005,13 +2005,13 @@
 
 		if (nextoff <= nread) {
 			m->valid = VM_PAGE_BITS_ALL;
-			m->dirty = 0;
+			vm_page_undirty(m);
 		} else if (toff < nread) {
 			int nvalid = ((nread + DEV_BSIZE - 1) - toff) & ~(DEV_BSIZE - 1);
 			vm_page_set_validclean(m, 0, nvalid);
 		} else {
 			m->valid = 0;
-			m->dirty = 0;
+			vm_page_undirty(m);
 		}
 
 		if (i != ap->a_reqpage) {
Index: miscfs/specfs/spec_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/miscfs/specfs/spec_vnops.c,v
retrieving revision 1.90
diff -u -r1.90 spec_vnops.c
--- spec_vnops.c	1999/07/20 09:47:45	1.90
+++ spec_vnops.c	1999/07/21 05:50:06
@@ -860,7 +860,7 @@
 
 		if (nextoff <= nread) {
 			m->valid = VM_PAGE_BITS_ALL;
-			m->dirty = 0;
+			vm_page_undirty(m);
 		} else if (toff < nread) {
 			/*
 			 * Since this is a VM request, we have to supply the
@@ -870,7 +870,7 @@
 			vm_page_set_validclean(m, 0, nread - toff);
 		} else {
 			m->valid = 0;
-			m->dirty = 0;
+			vm_page_undirty(m);
 		}
 
 		if (i != ap->a_reqpage) {
Index: nfs/nfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/nfs/nfs_bio.c,v
retrieving revision 1.74
diff -u -r1.74 nfs_bio.c
--- nfs_bio.c	1999/06/26 02:46:29	1.74
+++ nfs_bio.c	1999/07/08 22:21:48
@@ -185,7 +185,7 @@
 			 * Read operation filled an entire page
 			 */
 			m->valid = VM_PAGE_BITS_ALL;
-			m->dirty = 0;
+			vm_page_undirty(m);
 		} else if (size > toff) {
 			/*
 			 * Read operation filled a partial page.
@@ -313,7 +313,7 @@
 		int nwritten = round_page(count - uio.uio_resid) / PAGE_SIZE;
 		for (i = 0; i < nwritten; i++) {
 			rtvals[i] = VM_PAGER_OK;
-			pages[i]->dirty = 0;
+			vm_page_undirty(pages[i]);
 		}
 		if (must_commit)
 			nfs_clearcommit(vp->v_mount);
Index: ufs/ufs/ufs_readwrite.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_readwrite.c,v
retrieving revision 1.61
diff -u -r1.61 ufs_readwrite.c
--- ufs_readwrite.c	1999/07/25 02:07:16	1.61
+++ ufs_readwrite.c	1999/07/29 17:40:14
@@ -591,6 +591,7 @@
 	if (firstindex == 0)
 		vp->v_lastr = 0;
+#if 0
 	if (((obj->behavior != OBJ_RANDOM) &&
 		(firstindex != 0) &&
 		(firstindex <= vp->v_lastr) &&
 		((firstindex + pcount) > vp->v_lastr)) ||
@@ -652,6 +653,7 @@
 		vm_page_zero_invalid(mreq, TRUE);
 		return VM_PAGER_OK;
 	}
+#endif
 
 	/*
 	 * foff is the file offset of the required page
@@ -670,7 +672,7 @@
 	if (reqblkno == -1) {
 		if ((mreq->flags & PG_ZERO) == 0)
 			vm_page_zero_fill(mreq);
-		mreq->dirty = 0;
+		vm_page_undirty(mreq);
 		mreq->valid = VM_PAGE_BITS_ALL;
 		return VM_PAGER_OK;
 	} else {
Index: vm/swap_pager.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/swap_pager.c,v
retrieving revision 1.121
diff -u -r1.121 swap_pager.c
--- swap_pager.c	1999/07/16 05:11:35	1.121
+++ swap_pager.c	1999/07/29 18:24:33
@@ -1631,7 +1631,7 @@
 
 			pmap_clear_modify(VM_PAGE_TO_PHYS(m));
 			m->valid = VM_PAGE_BITS_ALL;
-			m->dirty = 0;
+			vm_page_undirty(m);
 			vm_page_flag_clear(m, PG_ZERO);
 
 			/*
@@ -1656,7 +1656,7 @@
 			 */
 			vm_page_protect(m, VM_PROT_READ);
 			pmap_clear_modify(VM_PAGE_TO_PHYS(m));
-			m->dirty = 0;
+			vm_page_undirty(m);
 			vm_page_io_finish(m);
 		}
 	}
Index: vm/vm_fault.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_fault.c,v
retrieving revision 1.103
diff -u -r1.103 vm_fault.c
--- vm_fault.c	1999/07/20 05:46:56	1.103
+++ vm_fault.c	1999/07/29 17:43:07
@@ -386,7 +386,7 @@
 			int reqpage;
 			int ahead, behind;
 
-			if (fs.first_object->behavior == OBJ_RANDOM) {
+			if (fs.entry->behavior == BEHAV_RANDOM) {
 				ahead = 0;
 				behind = 0;
 			} else {
@@ -400,7 +400,7 @@
 			}
 
 			if ((fs.first_object->type != OBJT_DEVICE) &&
-			    (fs.first_object->behavior == OBJ_SEQUENTIAL)) {
+			    (fs.entry->behavior == BEHAV_SEQUENTIAL)) {
 				vm_pindex_t firstpindex, tmppindex;
 
 				if (fs.first_pindex < 2*(VM_FAULT_READ_BEHIND + VM_FAULT_READ_AHEAD + 1))
Index: vm/vm_map.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_map.c,v
retrieving revision 1.173
diff -u -r1.173 vm_map.c
--- vm_map.c	1999/07/21 18:02:27	1.173
+++ vm_map.c	1999/07/29 17:44:43
@@ -1051,13 +1051,13 @@
 
 		switch (advise) {
 		case MADV_NORMAL:
-			current->object.vm_object->behavior = OBJ_NORMAL;
+			current->behavior = BEHAV_NORMAL;
 			break;
 		case MADV_SEQUENTIAL:
-			current->object.vm_object->behavior = OBJ_SEQUENTIAL;
+			current->behavior = BEHAV_SEQUENTIAL;
 			break;
 		case MADV_RANDOM:
-			current->object.vm_object->behavior = OBJ_RANDOM;
+			current->behavior = BEHAV_RANDOM;
 			break;
 	/*
 	 * Right now, we could handle DONTNEED and WILLNEED with common code.
Index: vm/vm_map.h
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_map.h,v
retrieving revision 1.43
diff -u -r1.43 vm_map.h
--- vm_map.h	1999/07/10 18:16:08	1.43
+++ vm_map.h	1999/07/29 17:41:46
@@ -89,6 +89,10 @@
 	struct vm_map *sub_map;		/* belongs to another map */
 };
 
+#define BEHAV_NORMAL		0x0	/* default behavior */
+#define BEHAV_SEQUENTIAL	0x1	/* expect sequential accesses */
+#define BEHAV_RANDOM		0x2	/* expect random accesses */
+
 /*
  *	Address map entries consist of start and end addresses,
  *	a VM object (or sharing map) and offset into that object,
@@ -102,6 +106,8 @@
 	vm_offset_t end;		/* end address */
 	vm_offset_t avail_ssize;	/* amt can grow if this is a stack */
 	union vm_map_object object;	/* object I point to */
+	u_short behavior;		/* fault behavior */
+	u_short unused3;		/* (filler) */
 	vm_ooffset_t offset;		/* offset into object */
 	u_char eflags;			/* map entry flags */
 	/* Only in task maps: */
Index: vm/vm_object.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_object.c,v
retrieving revision 1.160
diff -u -r1.160 vm_object.c
--- vm_object.c	1999/07/16 05:11:36	1.160
+++ vm_object.c	1999/07/29 17:19:05
@@ -154,7 +154,9 @@
 	object->flags = 0;
 	if ((object->type == OBJT_DEFAULT) || (object->type == OBJT_SWAP))
 		vm_object_set_flag(object, OBJ_ONEMAPPING);
+#if 0
 	object->behavior = OBJ_NORMAL;
+#endif
 	object->paging_in_progress = 0;
 	object->resident_page_count = 0;
 	object->shadow_count = 0;
@@ -735,12 +737,22 @@
  *	vm_object_madvise:
  *
  *	Implements the madvise function at the object/page level.
+ *
+ *	MADV_WILLNEED	(any map)
+ *
+ *	    Force activation of the page if it is found in-core.
+ *
+ *	MADV_DONTNEED	(any map)
+ *
+ *	    Deactivate or cache the page as appropriate.
 *
- *	Currently, madvise() functions are limited to the default and
- *	swap object types only, and also limited to only the unshared portions
- *	of a process's address space.  MADV_FREE, certainly, could never be
- *	run on anything else.  The others are more flexible and the code could
- *	be adjusted in the future to handle expanded cases for them.
+ *	MADV_FREE	(OBJT_DEFAULT or OBJT_SWAP maps, OBJ_ONEMAPPING only)
+ *
+ *	    Essentially free the underlying storage.  We mark the storage
+ *	    clean but do not unmap it from the process, allowing the process
+ *	    to reuse the storage (by dirtying it again) as well as allowing
+ *	    the VM system to reuse it for other purposes, turning it back
+ *	    into zero-fill.
 */
 void
 vm_object_madvise(object, pindex, count, advise)
@@ -768,20 +780,26 @@
 		tpindex = pindex;
shadowlookup:
-		if (tobject->type != OBJT_DEFAULT &&
-		    tobject->type != OBJT_SWAP
-		) {
-			continue;
-		}
+		/*
+		 * MADV_FREE only operates on OBJT_DEFAULT or OBJT_SWAP pages
+		 * and those pages must be OBJ_ONEMAPPING.
+		 */
 
-		if ((tobject->flags & OBJ_ONEMAPPING) == 0)
-			continue;
+		if (advise == MADV_FREE) {
+			if ((tobject->type != OBJT_DEFAULT &&
+			     tobject->type != OBJT_SWAP) ||
+			    (tobject->flags & OBJ_ONEMAPPING) == 0
+			) {
+				continue;
+			}
+		}
 
 		m = vm_page_lookup(tobject, tpindex);
 
 		if (m == NULL) {
 			/*
-			 * There may be swap even if there is no backing page
+			 * There may be swap in an intermediate object even
+			 * if there is no backing page; deal with it here.
 			 */
 			if (advise == MADV_FREE && tobject->type == OBJT_SWAP)
 				swap_pager_freespace(tobject, tpindex, 1);
@@ -813,9 +831,17 @@
 			goto relookup;
 
 		if (advise == MADV_WILLNEED) {
+			/*
+			 * Activate the page early to reduce the chance of
+			 * it being reused before the program accesses it.
+			 */
 			vm_page_activate(m);
 		} else if (advise == MADV_DONTNEED) {
-			vm_page_deactivate(m);
+			/*
+			 * Deactivate, cache, or do nothing to the page
+			 * as appropriate.
+			 */
+			vm_page_dontneed(m);
 		} else if (advise == MADV_FREE) {
 			/*
 			 * Mark the page clean.  This will allow the page
@@ -833,7 +859,7 @@
 			 * it.
 			 */
 			pmap_clear_modify(VM_PAGE_TO_PHYS(m));
-			m->dirty = 0;
+			vm_page_undirty(m);
 			m->act_count = 0;
 			vm_page_deactivate(m);
 			if (tobject->type == OBJT_SWAP)
Index: vm/vm_object.h
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_object.h,v
retrieving revision 1.58
diff -u -r1.58 vm_object.h
--- vm_object.h	1999/07/16 05:11:37	1.58
+++ vm_object.h	1999/07/29 17:13:33
@@ -98,7 +98,7 @@
 	u_short flags;			/* see below */
 	u_short pg_color;		/* color of first page in obj */
 	u_short paging_in_progress;	/* Paging (in or out) so don't collapse or destroy */
-	u_short behavior;		/* see below */
+	u_short unused13;
 	int resident_page_count;	/* number of resident pages */
 	struct vm_object *backing_object; /* object that I'm a shadow of */
 	vm_ooffset_t backing_object_offset;/* Offset in backing object */
@@ -148,10 +148,6 @@
 #define OBJ_CLEANING	0x0200
 #define	OBJ_OPT		0x1000		/* I/O optimization */
 #define	OBJ_ONEMAPPING	0x2000		/* One USE (a single, non-forked) mapping flag */
-
-#define OBJ_NORMAL	0x0		/* default behavior */
-#define OBJ_SEQUENTIAL	0x1		/* expect sequential accesses */
-#define OBJ_RANDOM	0x2		/* expect random accesses */
 
 #define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT)
 #define OFF_TO_IDX(off) ((vm_pindex_t)(((vm_ooffset_t)(off)) >> PAGE_SHIFT))
Index: vm/vm_page.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_page.c,v
retrieving revision 1.134
diff -u -r1.134 vm_page.c
--- vm_page.c	1999/07/01 19:53:42	1.134
+++ vm_page.c	1999/07/29 18:29:35
@@ -862,6 +862,10 @@
 	m->busy = 0;
 	m->valid = 0;
 	m->dirty = 0;
+#if 0
+	/* FUTURE */
+	KASSERT(m->dirty == 0, ("vm_page_alloc: free/cache page %p was dirty", m));
+#endif
 	m->queue = PQ_NONE;
 
 	/*
@@ -997,6 +1001,8 @@
 *	vm_page_activate:
 *
 *	Put the specified page on the active list (if appropriate).
+ *	Ensure that act_count is at least ACT_INIT but do not otherwise
+ *	mess with it.
 *
 *	The page queues must be locked.
 *	This routine may not block.
@@ -1119,6 +1125,7 @@
 	}
 
 	m->valid = 0;
+	vm_page_undirty(m);
 
 	if (m->wire_count != 0) {
 #if !defined(MAX_PERF)
@@ -1347,6 +1354,67 @@
 }
 
 /*
+ *	vm_page_dontneed
+ *
+ *	Cache, deactivate, or do nothing as appropriate.  This routine
+ *	is typically used by madvise() MADV_DONTNEED.
+ *
+ *	Generally speaking we want to move the page into the cache so
+ *	it gets reused quickly.  However, this can result in a silly syndrome
+ *	due to the page recycling too quickly.  Small objects will not be
+ *	fully cached.  On the other hand, if we move the page to the inactive
+ *	queue we wind up with a problem whereby very large objects
+ *	unnecessarily blow away our inactive and cache queues.
+ *
+ *	The solution is to move the pages based on a fixed weighting.  We
+ *	either leave them alone, deactivate them, or move them to the cache,
+ *	where moving them to the cache has the highest weighting.
+ *	By forcing some pages into other queues we eventually force the
+ *	system to balance the queues, potentially recovering other unrelated
+ *	space from active.  The idea is to not force this to happen too
+ *	often.
+ */
+
+void
+vm_page_dontneed(m)
+	vm_page_t m;
+{
+	static int dnweight;
+	int dnw;
+
+	dnw = ++dnweight;
+
+	/*
+	 * Just adjust act_count and do not otherwise mess with the page
+	 * if it is already on the inactive or cache queues, and for one
+	 * page out of every 32.
+	 */
+
+	if ((dnw & 31) == 0 ||
+	    m->queue == PQ_INACTIVE ||
+	    m->queue - m->pc == PQ_CACHE
+	) {
+		if (m->act_count > 0)
+			--m->act_count;
+		return;
+	}
+
+	vm_page_test_dirty(m);
+
+	if ((dnw & 7) == 0 || m->dirty) {
+		/*
+		 * Deactivate the page 3 times out of 32.
+		 */
+		vm_page_deactivate(m);
+	} else {
+		/*
+		 * Cache the page 28 times out of every 32.
+		 */
+		vm_page_cache(m);
+	}
+}
+
+/*
 * Grab a page, waiting until we are waken up due to the page
 * changing state.  We keep on waiting, if the page continues
 * to be in the object.  If the page doesn't exist, allocate it.
@@ -1778,6 +1846,10 @@
 	m->valid = VM_PAGE_BITS_ALL;
 	m->flags = 0;
 	m->dirty = 0;
+#if 0
+	/* future */
+	KASSERT(m->dirty == 0, ("ctgmalloc1: page %p was dirty", m));
+#endif
 	m->wire_count = 0;
 	m->busy = 0;
 	m->queue = PQ_NONE;
Index: vm/vm_page.h
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_page.h,v
retrieving revision 1.63
diff -u -r1.63 vm_page.h
--- vm_page.h	1999/07/22 06:04:17	1.63
+++ vm_page.h	1999/07/29 06:38:23
@@ -376,6 +376,7 @@
 vm_page_t vm_page_alloc __P((vm_object_t, vm_pindex_t, int));
 vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
 void vm_page_cache __P((register vm_page_t));
+void vm_page_dontneed __P((register vm_page_t));
 static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
 static __inline void vm_page_free __P((vm_page_t));
 static __inline void vm_page_free_zero __P((vm_page_t));
@@ -555,6 +556,18 @@
 {
 	KASSERT(m->queue - m->pc != PQ_CACHE, ("vm_page_dirty: page in cache!"));
 	m->dirty = VM_PAGE_BITS_ALL;
+}
+
+/*
+ *	vm_page_undirty:
+ *
+ *	Set page to not be dirty.  Note: does not clear pmap modify bits.
+ */
+
+static __inline void
+vm_page_undirty(vm_page_t m)
+{
+	m->dirty = 0;
 }
 
 static __inline vm_page_t
Index: vm/vm_pageout.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_pageout.c,v
retrieving revision 1.144
diff -u -r1.144 vm_pageout.c
--- vm_pageout.c	1999/07/04 00:25:37	1.144
+++ vm_pageout.c	1999/07/13 05:47:33
@@ -425,7 +425,7 @@
 			 * worked.
 			 */
 			pmap_clear_modify(VM_PAGE_TO_PHYS(mt));
-			mt->dirty = 0;
+			vm_page_undirty(mt);
 			break;
 		case VM_PAGER_ERROR:
 		case VM_PAGER_FAIL:
Index: vm/vnode_pager.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vnode_pager.c,v
retrieving revision 1.112
diff -u -r1.112 vnode_pager.c
--- vnode_pager.c	1999/07/01 19:53:43	1.112
+++ vnode_pager.c	1999/07/08 22:27:33
@@ -511,7 +511,7 @@
 		vm_pager_unmap_page(kva);
 	}
 	pmap_clear_modify(VM_PAGE_TO_PHYS(m));
-	m->dirty = 0;
+	vm_page_undirty(m);
 	vm_page_flag_clear(m, PG_ZERO);
 	if (!error)
 		m->valid = VM_PAGE_BITS_ALL;
@@ -773,7 +773,7 @@
 				 * Read filled up entire page.
 				 */
 				mt->valid = VM_PAGE_BITS_ALL;
-				mt->dirty = 0;
+				vm_page_undirty(mt);	/* should be an assert? XXX */
 				pmap_clear_modify(VM_PAGE_TO_PHYS(mt));
 			} else {
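
    P.S. For the curious, the weighting in vm_page_dontneed() works out
    to: leave the page alone 1 time in 32, deactivate it 3 times in 32
    (the multiples of 8 that are not multiples of 32), and cache it the
    remaining 28 times in 32.  A quick userland sanity check of the
    dnweight arithmetic (hypothetical test code, not part of the patch;
    assumes clean pages and ignores the queue short-circuits):

	#include <stdio.h>

	int
	main(void)
	{
		static int dnweight;
		int leave = 0, deact = 0, cache = 0;
		int i;

		for (i = 0; i < 3200; ++i) {
			int dnw = ++dnweight;

			if ((dnw & 31) == 0)
				++leave;	/* leave the page alone */
			else if ((dnw & 7) == 0)
				++deact;	/* vm_page_deactivate() */
			else
				++cache;	/* vm_page_cache() */
		}
		/* prints: leave 100  deactivate 300  cache 2800 */
		printf("leave %d  deactivate %d  cache %d\n", leave, deact, cache);
		return (0);
	}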