Date: Thu, 23 Feb 2012 21:07:16 +0000 (UTC)
From: Konstantin Belousov <kib@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r232071 - head/sys/vm
Message-ID: <201202232107.q1NL7GHi023139@svn.freebsd.org>
Author: kib
Date: Thu Feb 23 21:07:16 2012
New Revision: 232071
URL: http://svn.freebsd.org/changeset/base/232071

Log:
  Account the writeable shared mappings backed by a file in the vnode
  v_writecount.  Keep the amount of the virtual address space used by the
  mappings in the new vm_object un_pager.vnp.writemappings counter.  The
  vnode v_writecount is incremented when writemappings becomes non-zero,
  and decremented when writemappings returns to zero.

  Writeable shared vnode-backed mappings are accounted for in vm_mmap(),
  and vm_map_insert() is instructed to set the MAP_ENTRY_VN_WRITECNT flag
  on the created map entry.  During deferred map entry deallocation,
  vm_map_process_deferred() checks for MAP_ENTRY_VN_WRITECNT and
  decrements writemappings for the vm object.

  Now the writeable mount cannot be demoted to read-only while writeable
  shared mappings of vnodes from the mount point exist.  Also, execve(2)
  now fails for such files with ETXTBSY, as it should.

  Noted by:	tegge
  Reviewed by:	tegge (long time ago, early version), alc
  Tested by:	pho
  MFC after:	3 weeks

Modified:
  head/sys/vm/vm_map.c
  head/sys/vm/vm_map.h
  head/sys/vm/vm_mmap.c
  head/sys/vm/vm_object.h
  head/sys/vm/vnode_pager.c
  head/sys/vm/vnode_pager.h

Modified: head/sys/vm/vm_map.c
==============================================================================
--- head/sys/vm/vm_map.c	Thu Feb 23 20:58:52 2012	(r232070)
+++ head/sys/vm/vm_map.c	Thu Feb 23 21:07:16 2012	(r232071)
@@ -91,6 +91,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_pager.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
+#include <vm/vnode_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/uma.h>
 
@@ -475,11 +476,23 @@ vm_map_process_deferred(void)
 {
 	struct thread *td;
 	vm_map_entry_t entry;
+	vm_object_t object;
 
 	td = curthread;
-
 	while ((entry = td->td_map_def_user) != NULL) {
 		td->td_map_def_user = entry->next;
+		if ((entry->eflags & MAP_ENTRY_VN_WRITECNT) != 0) {
+			/*
+			 * Decrement the object's writemappings and
+			 * possibly the vnode's v_writecount.
+			 */
+			KASSERT((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0,
+			    ("Submap with writecount"));
+			object = entry->object.vm_object;
+			KASSERT(object != NULL, ("No object for writecount"));
+			vnode_pager_release_writecount(object, entry->start,
+			    entry->end);
+		}
 		vm_map_entry_deallocate(entry, FALSE);
 	}
 }
@@ -1174,6 +1187,8 @@ vm_map_insert(vm_map_t map, vm_object_t
 		protoeflags |= MAP_ENTRY_NOSYNC;
 	if (cow & MAP_DISABLE_COREDUMP)
 		protoeflags |= MAP_ENTRY_NOCOREDUMP;
+	if (cow & MAP_VN_WRITECOUNT)
+		protoeflags |= MAP_ENTRY_VN_WRITECNT;
 	if (cow & MAP_INHERIT_SHARE)
 		inheritance = VM_INHERIT_SHARE;
 	else
@@ -1516,6 +1531,11 @@ vm_map_simplify_entry(vm_map_t map, vm_m
 			 * references.  Thus, the map lock can be kept
 			 * without causing a lock-order reversal with
 			 * the vnode lock.
+			 *
+			 * Since we count the number of virtual page
+			 * mappings in object->un_pager.vnp.writemappings,
+			 * the writemappings value should not be adjusted
+			 * when the entry is disposed of.
 			 */
 			if (prev->object.vm_object)
 				vm_object_deallocate(prev->object.vm_object);
@@ -1627,6 +1647,13 @@ _vm_map_clip_start(vm_map_t map, vm_map_
 
 	if ((entry->eflags & MAP_ENTRY_IS_SUB_MAP) == 0) {
 		vm_object_reference(new_entry->object.vm_object);
+		/*
+		 * The object->un_pager.vnp.writemappings for the
+		 * object of MAP_ENTRY_VN_WRITECNT type entry shall be
+		 * kept as is here.  The virtual pages are
+		 * re-distributed among the clipped entries, so the sum is
+		 * left the same.
+		 */
 	}
 }
 
@@ -2900,6 +2927,7 @@ vm_map_copy_entry(
 	vm_ooffset_t *fork_charge)
 {
 	vm_object_t src_object;
+	vm_map_entry_t fake_entry;
 	vm_offset_t size;
 	struct ucred *cred;
 	int charged;
@@ -2965,6 +2993,27 @@ vm_map_copy_entry(
 			src_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
 			dst_entry->eflags |= (MAP_ENTRY_COW|MAP_ENTRY_NEEDS_COPY);
 			dst_entry->offset = src_entry->offset;
+			if (src_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
+				/*
+				 * MAP_ENTRY_VN_WRITECNT cannot
+				 * indicate write reference from
+				 * src_entry, since the entry is
+				 * marked as needs copy.  Allocate a
+				 * fake entry that is used to
+				 * decrement object->un_pager.vnp.writecount
+				 * at the appropriate time.  Attach
+				 * fake_entry to the deferred list.
+				 */
+				fake_entry = vm_map_entry_create(dst_map);
+				fake_entry->eflags = MAP_ENTRY_VN_WRITECNT;
+				src_entry->eflags &= ~MAP_ENTRY_VN_WRITECNT;
+				vm_object_reference(src_object);
+				fake_entry->object.vm_object = src_object;
+				fake_entry->start = src_entry->start;
+				fake_entry->end = src_entry->end;
+				fake_entry->next = curthread->td_map_def_user;
+				curthread->td_map_def_user = fake_entry;
+			}
 		} else {
 			dst_entry->object.vm_object = NULL;
 			dst_entry->offset = 0;
@@ -3043,6 +3092,7 @@ vmspace_fork(struct vmspace *vm1, vm_oof
 	vm_map_lock(old_map);
 	if (old_map->busy)
 		vm_map_wait_busy(old_map);
+	new_map = NULL; /* silence gcc */
 	vm2 = vmspace_alloc(old_map->min_offset, old_map->max_offset);
 	if (vm2 == NULL)
 		goto unlock_and_return;
@@ -3122,6 +3172,16 @@ vmspace_fork(struct vmspace *vm1, vm_oof
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
 			    MAP_ENTRY_IN_TRANSITION);
 			new_entry->wired_count = 0;
+			if (new_entry->eflags & MAP_ENTRY_VN_WRITECNT) {
+				object = new_entry->object.vm_object;
+				KASSERT(((struct vnode *)object->handle)->
+				    v_writecount > 0,
+				    ("vmspace_fork: v_writecount"));
+				KASSERT(object->un_pager.vnp.writemappings > 0,
+				    ("vmspace_fork: vnp.writecount"));
+				vnode_pager_update_writecount(object,
+				    new_entry->start, new_entry->end);
+			}
 
 			/*
 			 * Insert the entry into the new map -- we know we're
@@ -3146,8 +3206,11 @@ vmspace_fork(struct vmspace *vm1, vm_oof
 			 */
 			new_entry = vm_map_entry_create(new_map);
 			*new_entry = *old_entry;
+			/*
+			 * Copied entry is COW over the old object.
+			 */
 			new_entry->eflags &= ~(MAP_ENTRY_USER_WIRED |
-			    MAP_ENTRY_IN_TRANSITION);
+			    MAP_ENTRY_IN_TRANSITION | MAP_ENTRY_VN_WRITECNT);
 			new_entry->wired_count = 0;
 			new_entry->object.vm_object = NULL;
 			new_entry->cred = NULL;
@@ -3161,9 +3224,15 @@ vmspace_fork(struct vmspace *vm1, vm_oof
 		old_entry = old_entry->next;
 	}
 unlock_and_return:
-	vm_map_unlock(old_map);
+	/*
+	 * Use inlined vm_map_unlock() to postpone handling the deferred
+	 * map entries, which cannot be done until both old_map and
+	 * new_map locks are released.
+	 */
+	sx_xunlock(&old_map->lock);
 	if (vm2 != NULL)
-		vm_map_unlock(new_map);
+		sx_xunlock(&new_map->lock);
+	vm_map_process_deferred();
 	return (vm2);
 }
 

Modified: head/sys/vm/vm_map.h
==============================================================================
--- head/sys/vm/vm_map.h	Thu Feb 23 20:58:52 2012	(r232070)
+++ head/sys/vm/vm_map.h	Thu Feb 23 21:07:16 2012	(r232071)
@@ -139,6 +139,7 @@ struct vm_map_entry {
 #define	MAP_ENTRY_GROWS_UP		0x2000	/* Bottom-up stacks */
 
 #define	MAP_ENTRY_WIRE_SKIPPED		0x4000
+#define	MAP_ENTRY_VN_WRITECNT		0x8000	/* writeable vnode mapping */
 
 #ifdef	_KERNEL
 static __inline u_char
@@ -315,6 +316,7 @@ long vmspace_wired_count(struct vmspace
 #define	MAP_DISABLE_SYNCER	0x0020
 #define	MAP_DISABLE_COREDUMP	0x0100
 #define	MAP_PREFAULT_MADVISE	0x0200	/* from (user) madvise request */
+#define	MAP_VN_WRITECOUNT	0x0400
 #define	MAP_STACK_GROWS_DOWN	0x1000
 #define	MAP_STACK_GROWS_UP	0x2000
 #define	MAP_ACC_CHARGED		0x4000

Modified: head/sys/vm/vm_mmap.c
==============================================================================
--- head/sys/vm/vm_mmap.c	Thu Feb 23 20:58:52 2012	(r232070)
+++ head/sys/vm/vm_mmap.c	Thu Feb 23 21:07:16 2012	(r232071)
@@ -81,6 +81,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_pageout.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_page.h>
+#include <vm/vnode_pager.h>
 
 #ifdef HWPMC_HOOKS
 #include <sys/pmckern.h>
@@ -93,7 +94,7 @@ struct sbrk_args {
 #endif
 
 static int vm_mmap_vnode(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
-    int *, struct vnode *, vm_ooffset_t *, vm_object_t *);
+    int *, struct vnode *, vm_ooffset_t *, vm_object_t *, boolean_t *);
 static int vm_mmap_cdev(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
     int *, struct cdev *, vm_ooffset_t *, vm_object_t *);
 static int vm_mmap_shm(struct thread *, vm_size_t, vm_prot_t, vm_prot_t *,
@@ -1218,28 +1219,33 @@ sys_munlock(td, uap)
 /*
  * vm_mmap_vnode()
  *
- * MPSAFE
- *
  * Helper function for vm_mmap.  Perform sanity check specific for mmap
  * operations on vnodes.
+ *
+ * For VCHR vnodes, the vnode lock is held over the call to
+ * vm_mmap_cdev() to keep vp->v_rdev valid.
  */
 int
 vm_mmap_vnode(struct thread *td, vm_size_t objsize,
     vm_prot_t prot, vm_prot_t *maxprotp, int *flagsp,
-    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp)
+    struct vnode *vp, vm_ooffset_t *foffp, vm_object_t *objp,
+    boolean_t *writecounted)
 {
 	struct vattr va;
 	vm_object_t obj;
 	vm_offset_t foff;
 	struct mount *mp;
 	struct ucred *cred;
-	int error, flags;
-	int vfslocked;
+	int error, flags, locktype, vfslocked;
 
 	mp = vp->v_mount;
 	cred = td->td_ucred;
+	if ((*maxprotp & VM_PROT_WRITE) && (*flagsp & MAP_SHARED))
+		locktype = LK_EXCLUSIVE;
+	else
+		locktype = LK_SHARED;
 	vfslocked = VFS_LOCK_GIANT(mp);
-	if ((error = vget(vp, LK_SHARED, td)) != 0) {
+	if ((error = vget(vp, locktype, td)) != 0) {
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
@@ -1256,8 +1262,20 @@ vm_mmap_vnode(struct thread *td, vm_size
 	}
 	if (obj->handle != vp) {
 		vput(vp);
-		vp = (struct vnode*)obj->handle;
-		vget(vp, LK_SHARED, td);
+		vp = (struct vnode *)obj->handle;
+		/*
+		 * Bypass filesystems obey the mpsafety of the
+		 * underlying fs.
+		 */
+		error = vget(vp, locktype, td);
+		if (error != 0) {
+			VFS_UNLOCK_GIANT(vfslocked);
+			return (error);
+		}
+		if (locktype == LK_EXCLUSIVE) {
+			*writecounted = TRUE;
+			vnode_pager_update_writecount(obj, 0, objsize);
+		}
 	}
 	} else if (vp->v_type == VCHR) {
 		error = vm_mmap_cdev(td, objsize, prot, maxprotp, flagsp,
@@ -1293,7 +1311,7 @@ vm_mmap_vnode(struct thread *td, vm_size
 		objsize = round_page(va.va_size);
 		if (va.va_nlink == 0)
 			flags |= MAP_NOSYNC;
-	obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, td->td_ucred);
+	obj = vm_pager_allocate(OBJT_VNODE, vp, objsize, prot, foff, cred);
 	if (obj == NULL) {
 		error = ENOMEM;
 		goto done;
@@ -1432,6 +1450,7 @@ vm_mmap(vm_map_t map, vm_offset_t *addr,
 	int rv = KERN_SUCCESS;
 	int docow, error;
 	struct thread *td = curthread;
+	boolean_t writecounted;
 
 	if (size == 0)
 		return (0);
@@ -1470,6 +1489,8 @@ vm_mmap(vm_map_t map, vm_offset_t *addr,
 			return (EINVAL);
 		fitit = FALSE;
 	}
+	writecounted = FALSE;
+
 	/*
 	 * Lookup/allocate object.
 	 */
@@ -1480,7 +1501,7 @@ vm_mmap(vm_map_t map, vm_offset_t *addr,
 			break;
 		case OBJT_VNODE:
 			error = vm_mmap_vnode(td, size, prot, &maxprot, &flags,
-			    handle, &foff, &object);
+			    handle, &foff, &object, &writecounted);
 			break;
 		case OBJT_SWAP:
 			error = vm_mmap_shm(td, size, prot, &maxprot, &flags,
@@ -1520,6 +1541,8 @@ vm_mmap(vm_map_t map, vm_offset_t *addr,
 	/* Shared memory is also shared with children. */
 	if (flags & MAP_SHARED)
 		docow |= MAP_INHERIT_SHARE;
+	if (writecounted)
+		docow |= MAP_VN_WRITECOUNT;
 
 	if (flags & MAP_STACK)
 		rv = vm_map_stack(map, *addr, size, prot, maxprot,
@@ -1537,7 +1560,12 @@ vm_mmap(vm_map_t map, vm_offset_t *addr,
 		 * Lose the object reference.  Will destroy the
 		 * object if it's an unnamed anonymous mapping
 		 * or named anonymous without other references.
+		 *
+		 * If this mapping was accounted for in the vnode's
+		 * writecount, then undo that now.
 		 */
+		if (writecounted)
+			vnode_pager_release_writecount(object, 0, size);
 		vm_object_deallocate(object);
 	}
 

Modified: head/sys/vm/vm_object.h
==============================================================================
--- head/sys/vm/vm_object.h	Thu Feb 23 20:58:52 2012	(r232070)
+++ head/sys/vm/vm_object.h	Thu Feb 23 21:07:16 2012	(r232071)
@@ -112,6 +112,7 @@ struct vm_object {
 		 */
 		struct {
 			off_t vnp_size;
+			vm_ooffset_t writemappings;
 		} vnp;
 
 		/*

Modified: head/sys/vm/vnode_pager.c
==============================================================================
--- head/sys/vm/vnode_pager.c	Thu Feb 23 20:58:52 2012	(r232070)
+++ head/sys/vm/vnode_pager.c	Thu Feb 23 21:07:16 2012	(r232071)
@@ -222,6 +222,7 @@ retry:
 		object = vm_object_allocate(OBJT_VNODE, OFF_TO_IDX(round_page(size)));
 
 		object->un_pager.vnp.vnp_size = size;
+		object->un_pager.vnp.writemappings = 0;
 
 		object->handle = handle;
 		VI_LOCK(vp);
@@ -268,10 +269,16 @@ vnode_pager_dealloc(object)
 		wakeup(object);
 	}
 	ASSERT_VOP_ELOCKED(vp, "vnode_pager_dealloc");
+	if (object->un_pager.vnp.writemappings > 0) {
+		object->un_pager.vnp.writemappings = 0;
+		vp->v_writecount--;
+	}
 	vp->v_object = NULL;
 	vp->v_vflag &= ~VV_TEXT;
+	VM_OBJECT_UNLOCK(object);
 	while (refs-- > 0)
 		vunref(vp);
+	VM_OBJECT_LOCK(object);
 }
 
 static boolean_t
@@ -1215,3 +1222,81 @@ vnode_pager_undirty_pages(vm_page_t *ma,
 	}
 	VM_OBJECT_UNLOCK(obj);
 }
+
+void
+vnode_pager_update_writecount(vm_object_t object, vm_offset_t start,
+    vm_offset_t end)
+{
+	struct vnode *vp;
+	vm_ooffset_t old_wm;
+
+	VM_OBJECT_LOCK(object);
+	if (object->type != OBJT_VNODE) {
+		VM_OBJECT_UNLOCK(object);
+		return;
+	}
+	old_wm = object->un_pager.vnp.writemappings;
+	object->un_pager.vnp.writemappings += (vm_ooffset_t)end - start;
+	vp = object->handle;
+	if (old_wm == 0 && object->un_pager.vnp.writemappings != 0) {
+		ASSERT_VOP_ELOCKED(vp, "v_writecount inc");
+		vp->v_writecount++;
+	} else if (old_wm != 0 && object->un_pager.vnp.writemappings == 0) {
+		ASSERT_VOP_ELOCKED(vp, "v_writecount dec");
+		vp->v_writecount--;
+	}
+	VM_OBJECT_UNLOCK(object);
+}
+
+void
+vnode_pager_release_writecount(vm_object_t object, vm_offset_t start,
+    vm_offset_t end)
+{
+	struct vnode *vp;
+	struct mount *mp;
+	vm_offset_t inc;
+	int vfslocked;
+
+	VM_OBJECT_LOCK(object);
+
+	/*
+	 * First, recheck the object type to account for the race when
+	 * the vnode is reclaimed.
+	 */
+	if (object->type != OBJT_VNODE) {
+		VM_OBJECT_UNLOCK(object);
+		return;
+	}
+
+	/*
+	 * Optimize for the case when writemappings is not going to
+	 * zero.
+	 */
+	inc = end - start;
+	if (object->un_pager.vnp.writemappings != inc) {
+		object->un_pager.vnp.writemappings -= inc;
+		VM_OBJECT_UNLOCK(object);
+		return;
+	}
+
+	vp = object->handle;
+	vhold(vp);
+	VM_OBJECT_UNLOCK(object);
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+	mp = NULL;
+	vn_start_write(vp, &mp, V_WAIT);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+
+	/*
+	 * Decrement the object's writemappings, by swapping the start
+	 * and end arguments for vnode_pager_update_writecount().  If
+	 * there was not a race with vnode reclaimation, then the
+	 * vnode's v_writecount is decremented.
+	 */
+	vnode_pager_update_writecount(object, end, start);
+	VOP_UNLOCK(vp, 0);
+	vdrop(vp);
+	if (mp != NULL)
+		vn_finished_write(mp);
+	VFS_UNLOCK_GIANT(vfslocked);
+}

Modified: head/sys/vm/vnode_pager.h
==============================================================================
--- head/sys/vm/vnode_pager.h	Thu Feb 23 20:58:52 2012	(r232070)
+++ head/sys/vm/vnode_pager.h	Thu Feb 23 21:07:16 2012	(r232071)
@@ -46,7 +46,11 @@ int vnode_pager_generic_putpages(struct
 	  int count, boolean_t sync,
 	  int *rtvals);
 
+void vnode_pager_release_writecount(vm_object_t object, vm_offset_t start,
+    vm_offset_t end);
 void vnode_pager_undirty_pages(vm_page_t *ma, int *rtvals, int written);
+void vnode_pager_update_writecount(vm_object_t object, vm_offset_t start,
+    vm_offset_t end);
 
 #endif				/* _KERNEL */
 #endif				/* _VNODE_PAGER_ */
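For illustration, here is a minimal userland sketch (not part of the commit) of the behavior this change enforces: once writemappings accounting is in place, a writeable MAP_SHARED mapping alone should keep the vnode's v_writecount non-zero, so execve(2) of the mapped file is expected to fail with ETXTBSY even after the file descriptor is closed, and a downgrade of the containing mount to read-only should be refused while the mapping exists. The file path below is an arbitrary assumption; the program expects to be pointed at a writable copy of some executable.

/*
 * Hypothetical demonstration, not part of r232071.  Assumes the path
 * names a writable copy of an executable on a read-write mount.
 */
#include <sys/mman.h>
#include <sys/stat.h>

#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	const char *path;
	struct stat st;
	void *p;
	int fd;

	path = argc > 1 ? argv[1] : "/tmp/cat.copy";	/* assumed path */
	fd = open(path, O_RDWR);
	if (fd == -1)
		err(1, "open");
	if (fstat(fd, &st) == -1)
		err(1, "fstat");

	/* Writeable shared mapping; now accounted in v_writecount. */
	p = mmap(NULL, (size_t)st.st_size, PROT_READ | PROT_WRITE,
	    MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");

	/*
	 * Closing the descriptor used to drop the only write reference;
	 * with this change the mapping itself keeps v_writecount > 0.
	 */
	close(fd);

	/* Expected to fail with ETXTBSY while the mapping exists. */
	execl(path, path, (char *)NULL);
	printf("execl: %s (ETXTBSY expected)\n", strerror(errno));

	munmap(p, (size_t)st.st_size);
	return (0);
}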