From: Garrett Cooper
Date: Wed, 30 Dec 2015 08:55:14 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-user@freebsd.org
Subject: svn commit: r292909 - in user/ngie/stable-10-libnv: gnu/usr.bin/binutils/ld sys/dev/virtio/balloon sys/dev/xen/balloon sys/fs/nfs sys/fs/nfsclient sys/kern sys/sys sys/ufs/ffs
Message-Id: <201512300855.tBU8tEae095600@repo.freebsd.org>

Author: ngie
Date: Wed Dec 30 08:55:13 2015
New Revision: 292909
URL: https://svnweb.freebsd.org/changeset/base/292909

Log:
  MFstable/10 @ r292908

Modified:
  user/ngie/stable-10-libnv/gnu/usr.bin/binutils/ld/Makefile
  user/ngie/stable-10-libnv/sys/dev/virtio/balloon/virtio_balloon.c
  user/ngie/stable-10-libnv/sys/dev/xen/balloon/balloon.c
  user/ngie/stable-10-libnv/sys/fs/nfs/nfsport.h
  user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clnode.c
  user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clport.c
  user/ngie/stable-10-libnv/sys/kern/vfs_subr.c
  user/ngie/stable-10-libnv/sys/sys/param.h
  user/ngie/stable-10-libnv/sys/ufs/ffs/ffs_vfsops.c
Directory Properties:
  user/ngie/stable-10-libnv/   (props changed)

Modified: user/ngie/stable-10-libnv/gnu/usr.bin/binutils/ld/Makefile
==============================================================================
--- user/ngie/stable-10-libnv/gnu/usr.bin/binutils/ld/Makefile	Wed Dec 30 08:53:12 2015	(r292908)
+++ user/ngie/stable-10-libnv/gnu/usr.bin/binutils/ld/Makefile	Wed Dec 30 08:55:13 2015	(r292909)
@@ -52,6 +52,7 @@ CLEANFILES+= ldemul-list.h stringify.sed
 FILES= ${LDSCRIPTS:S|^|ldscripts/|}
 FILESDIR= ${SCRIPTDIR}
+LINKS= ${BINDIR}/ld ${BINDIR}/ld.bfd
 HOST= ${TARGET_TUPLE}
 LIBSEARCHPATH= \"=/lib\":\"=/usr/lib\"

Modified: user/ngie/stable-10-libnv/sys/dev/virtio/balloon/virtio_balloon.c
==============================================================================
--- user/ngie/stable-10-libnv/sys/dev/virtio/balloon/virtio_balloon.c	Wed Dec 30 08:53:12 2015	(r292908)
+++ user/ngie/stable-10-libnv/sys/dev/virtio/balloon/virtio_balloon.c	Wed Dec 30 08:55:13 2015	(r292909)
@@ -438,8 +438,7 @@ vtballoon_alloc_page(struct vtballoon_so
 {
 	vm_page_t m;

-	m = 
vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_WIRED | - VM_ALLOC_NOOBJ); + m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ); if (m != NULL) sc->vtballoon_current_npages++; @@ -450,7 +449,6 @@ static void vtballoon_free_page(struct vtballoon_softc *sc, vm_page_t m) { - vm_page_unwire(m, 0); vm_page_free(m); sc->vtballoon_current_npages--; } Modified: user/ngie/stable-10-libnv/sys/dev/xen/balloon/balloon.c ============================================================================== --- user/ngie/stable-10-libnv/sys/dev/xen/balloon/balloon.c Wed Dec 30 08:53:12 2015 (r292908) +++ user/ngie/stable-10-libnv/sys/dev/xen/balloon/balloon.c Wed Dec 30 08:55:13 2015 (r292909) @@ -94,13 +94,8 @@ SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, SYSCTL_ULONG(_dev_xen_balloon, OID_AUTO, high_mem, CTLFLAG_RD, &bs.balloon_high, 0, "High-mem balloon"); -struct balloon_entry { - vm_page_t page; - STAILQ_ENTRY(balloon_entry) list; -}; - /* List of ballooned pages, threaded through the mem_map array. */ -static STAILQ_HEAD(,balloon_entry) ballooned_pages; +static TAILQ_HEAD(,vm_page) ballooned_pages; /* Main work function, always executed in process context. */ static void balloon_process(void *unused); @@ -110,47 +105,6 @@ static void balloon_process(void *unused #define WPRINTK(fmt, args...) \ printk(KERN_WARNING "xen_mem: " fmt, ##args) -/* balloon_append: add the given page to the balloon. */ -static int -balloon_append(vm_page_t page) -{ - struct balloon_entry *entry; - - mtx_assert(&balloon_mutex, MA_OWNED); - - entry = malloc(sizeof(struct balloon_entry), M_BALLOON, M_NOWAIT); - if (!entry) - return (ENOMEM); - entry->page = page; - STAILQ_INSERT_HEAD(&ballooned_pages, entry, list); - bs.balloon_low++; - - return (0); -} - -/* balloon_retrieve: rescue a page from the balloon, if it is not empty. 
*/ -static vm_page_t -balloon_retrieve(void) -{ - vm_page_t page; - struct balloon_entry *entry; - - mtx_assert(&balloon_mutex, MA_OWNED); - - if (STAILQ_EMPTY(&ballooned_pages)) - return (NULL); - - entry = STAILQ_FIRST(&ballooned_pages); - STAILQ_REMOVE_HEAD(&ballooned_pages, list); - - page = entry->page; - free(entry, M_BALLOON); - - bs.balloon_low--; - - return (page); -} - static unsigned long current_target(void) { @@ -203,7 +157,6 @@ static int increase_reservation(unsigned long nr_pages) { unsigned long pfn, i; - struct balloon_entry *entry; vm_page_t page; long rc; struct xen_memory_reservation reservation = { @@ -217,10 +170,9 @@ increase_reservation(unsigned long nr_pa if (nr_pages > nitems(frame_list)) nr_pages = nitems(frame_list); - for (entry = STAILQ_FIRST(&ballooned_pages), i = 0; - i < nr_pages; i++, entry = STAILQ_NEXT(entry, list)) { - KASSERT(entry, ("ballooned_pages list corrupt")); - page = entry->page; + for (page = TAILQ_FIRST(&ballooned_pages), i = 0; + i < nr_pages; i++, page = TAILQ_NEXT(page, plinks.q)) { + KASSERT(page != NULL, ("ballooned_pages list corrupt")); frame_list[i] = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT); } @@ -245,8 +197,10 @@ increase_reservation(unsigned long nr_pa } for (i = 0; i < nr_pages; i++) { - page = balloon_retrieve(); - KASSERT(page, ("balloon_retrieve failed")); + page = TAILQ_FIRST(&ballooned_pages); + KASSERT(page != NULL, ("Unable to get ballooned page")); + TAILQ_REMOVE(&ballooned_pages, page, plinks.q); + bs.balloon_low--; pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT); KASSERT((xen_feature(XENFEAT_auto_translated_physmap) || @@ -255,7 +209,6 @@ increase_reservation(unsigned long nr_pa set_phys_to_machine(pfn, frame_list[i]); - vm_page_unwire(page, 0); vm_page_free(page); } @@ -286,24 +239,29 @@ decrease_reservation(unsigned long nr_pa for (i = 0; i < nr_pages; i++) { if ((page = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | - VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + VM_ALLOC_ZERO)) == NULL) { nr_pages = i; need_sleep = 1; break; } + if ((page->flags & PG_ZERO) == 0) { + /* + * Zero the page, or else we might be leaking + * important data to other domains on the same + * host. Xen doesn't scrub ballooned out memory + * pages, the guest is in charge of making + * sure that no information is leaked. + */ + pmap_zero_page(page); + } + pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT); frame_list[i] = PFNTOMFN(pfn); set_phys_to_machine(pfn, INVALID_P2M_ENTRY); - if (balloon_append(page) != 0) { - vm_page_unwire(page, 0); - vm_page_free(page); - - nr_pages = i; - need_sleep = 1; - break; - } + TAILQ_INSERT_HEAD(&ballooned_pages, page, plinks.q); + bs.balloon_low++; } set_xen_guest_handle(reservation.extent_start, frame_list); @@ -438,7 +396,8 @@ balloon_init(void *arg) /* Initialise the balloon with excess memory space. */ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { page = PHYS_TO_VM_PAGE(pfn << PAGE_SHIFT); - balloon_append(page); + TAILQ_INSERT_HEAD(&ballooned_pages, page, plinks.q); + bs.balloon_low++; } #undef max_pfn #endif Modified: user/ngie/stable-10-libnv/sys/fs/nfs/nfsport.h ============================================================================== --- user/ngie/stable-10-libnv/sys/fs/nfs/nfsport.h Wed Dec 30 08:53:12 2015 (r292908) +++ user/ngie/stable-10-libnv/sys/fs/nfs/nfsport.h Wed Dec 30 08:55:13 2015 (r292909) @@ -981,6 +981,13 @@ struct nfsreq { #define NFSVNO_DELEGOK(v) (1) #endif +/* + * Name used by getnewvnode() to describe filesystem, "newnfs". 
+ * For perfomance reasons it is useful to have the same string + * used in both places that call getnewvnode(). + */ +extern const char nfs_vnode_tag[]; + #endif /* _KERNEL */ #endif /* _NFS_NFSPORT_H */ Modified: user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clnode.c ============================================================================== --- user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clnode.c Wed Dec 30 08:53:12 2015 (r292908) +++ user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clnode.c Wed Dec 30 08:55:13 2015 (r292909) @@ -66,6 +66,8 @@ MALLOC_DECLARE(M_NEWNFSREQ); uma_zone_t newnfsnode_zone; +const char nfs_vnode_tag[] = "newnfs"; + static void nfs_freesillyrename(void *arg, __unused int pending); void @@ -124,7 +126,7 @@ ncl_nget(struct mount *mntp, u_int8_t *f } np = uma_zalloc(newnfsnode_zone, M_WAITOK | M_ZERO); - error = getnewvnode("newnfs", mntp, &newnfs_vnodeops, &nvp); + error = getnewvnode(nfs_vnode_tag, mntp, &newnfs_vnodeops, &nvp); if (error) { uma_zfree(newnfsnode_zone, np); return (error); @@ -332,4 +334,3 @@ ncl_invalcaches(struct vnode *vp) KDTRACE_NFS_ATTRCACHE_FLUSH_DONE(vp); mtx_unlock(&np->n_mtx); } - Modified: user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clport.c ============================================================================== --- user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clport.c Wed Dec 30 08:53:12 2015 (r292908) +++ user/ngie/stable-10-libnv/sys/fs/nfsclient/nfs_clport.c Wed Dec 30 08:55:13 2015 (r292909) @@ -199,7 +199,7 @@ nfscl_nget(struct mount *mntp, struct vn } np = uma_zalloc(newnfsnode_zone, M_WAITOK | M_ZERO); - error = getnewvnode("newnfs", mntp, &newnfs_vnodeops, &nvp); + error = getnewvnode(nfs_vnode_tag, mntp, &newnfs_vnodeops, &nvp); if (error) { uma_zfree(newnfsnode_zone, np); FREE((caddr_t)nfhp, M_NFSFH); Modified: user/ngie/stable-10-libnv/sys/kern/vfs_subr.c ============================================================================== --- user/ngie/stable-10-libnv/sys/kern/vfs_subr.c Wed Dec 30 08:53:12 2015 (r292908) +++ user/ngie/stable-10-libnv/sys/kern/vfs_subr.c Wed Dec 30 08:55:13 2015 (r292909) @@ -145,24 +145,51 @@ int vttoif_tab[10] = { static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* - * Free vnode target. Free vnodes may simply be files which have been stat'd - * but not read. This is somewhat common, and a small cache of such files - * should be kept to avoid recreation costs. + * "Free" vnode target. Free vnodes are rarely completely free, but are + * just ones that are cheap to recycle. Usually they are for files which + * have been stat'd but not read; these usually have inode and namecache + * data attached to them. This target is the preferred minimum size of a + * sub-cache consisting mostly of such files. The system balances the size + * of this sub-cache with its complement to try to prevent either from + * thrashing while the other is relatively inactive. The targets express + * a preference for the best balance. + * + * "Above" this target there are 2 further targets (watermarks) related + * to recyling of free vnodes. In the best-operating case, the cache is + * exactly full, the free list has size between vlowat and vhiwat above the + * free target, and recycling from it and normal use maintains this state. + * Sometimes the free list is below vlowat or even empty, but this state + * is even better for immediate use provided the cache is not full. + * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free + * ones) to reach one of these states. 
The watermarks are currently hard- + * coded as 4% and 9% of the available space higher. These and the default + * of 25% for wantfreevnodes are too large if the memory size is large. + * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim + * whenever vnlru_proc() becomes active. */ static u_long wantfreevnodes; -SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); -/* Number of vnodes in the free list. */ +SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, + &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes"); static u_long freevnodes; -SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, - "Number of vnodes in the free list"); +SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, + &freevnodes, 0, "Number of \"free\" vnodes"); +/* + * The vfs.vlru_allow_cache_src sysctl variable is no longer used but + * the sysctl remains to provide ABI compatibility. The new code frees + * namecache sources as the last chance to satisfy the highest watermark, + * instead of selecting the source vnodes randomly. This provides good + * enough behaviour to keep vn_fullpath() working in most situations. + * The filesystem layout with deep trees, where the depricated knob was + * required, is thus handled automatically. + */ static int vlru_allow_cache_src; SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW, - &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode"); + &vlru_allow_cache_src, 0, "Placeholder for API compatibility (unused)"); static u_long recycles_count; SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0, - "Number of vnodes recycled to avoid exceding kern.maxvnodes"); + "Number of vnodes recycled to meet vnode cache targets"); /* * Various variables used for debugging the new implementation of @@ -272,14 +299,13 @@ static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; -/* - * Number of vnodes we want to exist at any one time. This is mostly used - * to size hash tables in vnode-related code. It is normally not used in - * getnewvnode(), as wantfreevnodes is normally nonzero.) - * - * XXX desiredvnodes is historical cruft and should not exist. - */ +/* Target for maximum number of vnodes. */ int desiredvnodes; +static int gapvnodes; /* gap between wanted and desired */ +static int vhiwat; /* enough extras after expansion */ +static int vlowat; /* minimal extras before expansion */ +static int vstir; /* nonzero to stir non-free vnodes */ +static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ static int sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS) @@ -290,6 +316,8 @@ sysctl_update_desiredvnodes(SYSCTL_HANDL if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0) return (error); if (old_desiredvnodes != desiredvnodes) { + wantfreevnodes = desiredvnodes / 4; + /* XXX locking seems to be incomplete. 
*/ vfs_hash_changesize(desiredvnodes); cache_changesize(desiredvnodes); } @@ -298,9 +326,9 @@ sysctl_update_desiredvnodes(SYSCTL_HANDL SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0, - sysctl_update_desiredvnodes, "I", "Maximum number of vnodes"); + sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, - &wantfreevnodes, 0, "Minimum number of vnodes (legacy)"); + &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); @@ -331,11 +359,71 @@ PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_tr * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size - * grows, the ratio of physical pages to vnodes approaches sixteen to one. + * grows, the ratio of the memory size in KB to to vnodes approaches 64:1. */ #ifndef MAXVNODES_MAX -#define MAXVNODES_MAX (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16)) +#define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */ #endif + +/* + * Initialize a vnode as it first enters the zone. + */ +static int +vnode_init(void *mem, int size, int flags) +{ + struct vnode *vp; + struct bufobj *bo; + + vp = mem; + bzero(vp, size); + /* + * Setup locks. + */ + vp->v_vnlock = &vp->v_lock; + mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); + /* + * By default, don't allow shared locks unless filesystems opt-in. + */ + lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT, + LK_NOSHARE | LK_IS_VNODE); + /* + * Initialize bufobj. + */ + bo = &vp->v_bufobj; + bo->__bo_vnode = vp; + rw_init(BO_LOCKPTR(bo), "bufobj interlock"); + bo->bo_private = vp; + TAILQ_INIT(&bo->bo_clean.bv_hd); + TAILQ_INIT(&bo->bo_dirty.bv_hd); + /* + * Initialize namecache. + */ + LIST_INIT(&vp->v_cache_src); + TAILQ_INIT(&vp->v_cache_dst); + /* + * Initialize rangelocks. + */ + rangelock_init(&vp->v_rl); + return (0); +} + +/* + * Free a vnode when it is cleared from the zone. + */ +static void +vnode_fini(void *mem, int size) +{ + struct vnode *vp; + struct bufobj *bo; + + vp = mem; + rangelock_destroy(&vp->v_rl); + lockdestroy(vp->v_vnlock); + mtx_destroy(&vp->v_interlock); + bo = &vp->v_bufobj; + rw_destroy(BO_LOCKPTR(bo)); +} + static void vntblinit(void *dummy __unused) { @@ -345,15 +433,16 @@ vntblinit(void *dummy __unused) /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the - * physical memory size. The ratio of desiredvnodes to physical pages - * is one to four until desiredvnodes exceeds 98,304. Thereafter, the - * marginal ratio of desiredvnodes to physical pages is one to - * sixteen. However, desiredvnodes is limited by the kernel's heap + * physical memory size. The ratio of desiredvnodes to the physical + * memory size is 1:16 until desiredvnodes exceeds 98,304. + * Thereafter, the + * marginal ratio of desiredvnodes to the physical memory size is + * 1:64. However, desiredvnodes is limited by the kernel's heap * size. The memory required by desiredvnodes vnodes and vm objects - * may not exceed one seventh of the kernel's heap size. + * must not exceed 1/7th of the kernel's heap size. 
*/ - physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4, - cnt.v_page_count) / 16; + physvnodes = maxproc + pgtok(cnt.v_page_count) / 64 + + 3 * min(98304 * 16, pgtok(cnt.v_page_count)) / 64; virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) + sizeof(struct vnode))); desiredvnodes = min(physvnodes, virtvnodes); @@ -368,7 +457,7 @@ vntblinit(void *dummy __unused) TAILQ_INIT(&vnode_free_list); mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF); vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL, - NULL, NULL, UMA_ALIGN_PTR, 0); + vnode_init, vnode_fini, UMA_ALIGN_PTR, 0); vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); /* @@ -742,35 +831,41 @@ vattr_null(struct vattr *vap) * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. */ static int -vlrureclaim(struct mount *mp) +vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger) { struct vnode *vp; - int done; - int trigger; - int usevnodes; - int count; + int count, done, target; - /* - * Calculate the trigger point, don't allow user - * screwups to blow us up. This prevents us from - * recycling vnodes with lots of resident pages. We - * aren't trying to free memory, we are trying to - * free vnodes. - */ - usevnodes = desiredvnodes; - if (usevnodes <= 0) - usevnodes = 1; - trigger = cnt.v_page_count * 2 / usevnodes; done = 0; vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); - count = mp->mnt_nvnodelistsize / 10 + 1; - while (count != 0) { + count = mp->mnt_nvnodelistsize; + target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1); + target = target / 10 + 1; + while (count != 0 && done < target) { vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); if (vp == NULL) break; + /* + * XXX LRU is completely broken for non-free vnodes. First + * by calling here in mountpoint order, then by moving + * unselected vnodes to the end here, and most grossly by + * removing the vlruvp() function that was supposed to + * maintain the order. (This function was born broken + * since syncer problems prevented it doing anything.) The + * order is closer to LRC (C = Created). + * + * LRU reclaiming of vnodes seems to have last worked in + * FreeBSD-3 where LRU wasn't mentioned under any spelling. + * Then there was no hold count, and inactive vnodes were + * simply put on the free list in LRU order. The separate + * lists also break LRU. We prefer to reclaim from the + * free list for technical reasons. This tends to thrash + * the free list to keep very unrecently used held vnodes. + * The problem is mitigated by keeping the free list large. + */ TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); --count; @@ -779,10 +874,12 @@ vlrureclaim(struct mount *mp) /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. + * Also skip free vnodes. We are trying to make space + * to expand the free list, not reduce it. */ if (vp->v_usecount || - (!vlru_allow_cache_src && - !LIST_EMPTY(&(vp)->v_cache_src)) || + (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || + ((vp->v_iflag & VI_FREE) != 0) || (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VI_UNLOCK(vp); @@ -808,8 +905,8 @@ vlrureclaim(struct mount *mp) * vnode lock before our VOP_LOCK() call fails. 
*/ if (vp->v_usecount || - (!vlru_allow_cache_src && - !LIST_EMPTY(&(vp)->v_cache_src)) || + (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || + (vp->v_iflag & VI_FREE) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp, LK_INTERLOCK); @@ -842,7 +939,7 @@ relock_mnt: } /* - * Attempt to keep the free list at wantfreevnodes length. + * Attempt to reduce the free list by the requested amount. */ static void vnlru_free(int count) @@ -899,6 +996,24 @@ vnlru_free(int count) mtx_lock(&vnode_free_list_mtx); } } + +/* XXX some names and initialization are bad for limits and watermarks. */ +static int +vspace(void) +{ + int space; + + gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); + vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ + vlowat = vhiwat / 2; + if (numvnodes > desiredvnodes) + return (0); + space = desiredvnodes - numvnodes; + if (freevnodes > wantfreevnodes) + space += freevnodes - wantfreevnodes; + return (space); +} + /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some @@ -911,18 +1026,36 @@ static void vnlru_proc(void) { struct mount *mp, *nmp; - int done; - struct proc *p = vnlruproc; + unsigned long ofreevnodes, onumvnodes; + int done, force, reclaim_nc_src, trigger, usevnodes; - EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, + EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, SHUTDOWN_PRI_FIRST); + force = 0; for (;;) { - kproc_suspend_check(p); + kproc_suspend_check(vnlruproc); mtx_lock(&vnode_free_list_mtx); - if (freevnodes > wantfreevnodes) - vnlru_free(freevnodes - wantfreevnodes); - if (numvnodes <= desiredvnodes * 9 / 10) { + /* + * If numvnodes is too large (due to desiredvnodes being + * adjusted using its sysctl, or emergency growth), first + * try to reduce it by discarding from the free list. + */ + if (numvnodes > desiredvnodes && freevnodes > 0) + vnlru_free(ulmin(numvnodes - desiredvnodes, + freevnodes)); + /* + * Sleep if the vnode cache is in a good state. This is + * when it is not over-full and has space for about a 4% + * or 9% expansion (by growing its size or inexcessively + * reducing its free list). Otherwise, try to reclaim + * space for a 10% expansion. + */ + if (vstir && force == 0) { + force = 1; + vstir = 0; + } + if (vspace() >= vlowat && force == 0) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_free_list_mtx, @@ -931,30 +1064,66 @@ vnlru_proc(void) } mtx_unlock(&vnode_free_list_mtx); done = 0; + ofreevnodes = freevnodes; + onumvnodes = numvnodes; + /* + * Calculate parameters for recycling. These are the same + * throughout the loop to give some semblance of fairness. + * The trigger point is to avoid recycling vnodes with lots + * of resident pages. We aren't trying to free memory; we + * are trying to recycle or at least free vnodes. + */ + if (numvnodes <= desiredvnodes) + usevnodes = numvnodes - freevnodes; + else + usevnodes = numvnodes; + if (usevnodes <= 0) + usevnodes = 1; + /* + * The trigger value is is chosen to give a conservatively + * large value to ensure that it alone doesn't prevent + * making progress. The value can easily be so large that + * it is effectively infinite in some congested and + * misconfigured cases, and this is necessary. Normally + * it is about 8 to 100 (pages), which is quite large. 
+ */ + trigger = cnt.v_page_count * 2 / usevnodes; + if (force < 2) + trigger = vsmalltrigger; + reclaim_nc_src = force >= 3; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } - done += vlrureclaim(mp); + done += vlrureclaim(mp, reclaim_nc_src, trigger); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); + if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) + uma_reclaim(); if (done == 0) { -#if 0 - /* These messages are temporary debugging aids */ - if (vnlru_nowhere < 5) - printf("vnlru process getting nowhere..\n"); - else if (vnlru_nowhere == 5) - printf("vnlru process messages stopped.\n"); -#endif + if (force == 0 || force == 1) { + force = 2; + continue; + } + if (force == 2) { + force = 3; + continue; + } + force = 0; vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else kern_yield(PRI_USER); + /* + * After becoming active to expand above low water, keep + * active until above high water. + */ + force = vspace() < vhiwat; } } @@ -1028,22 +1197,31 @@ vtryrecycle(struct vnode *vp) return (0); } +static void +vcheckspace(void) +{ + + if (vspace() < vlowat && vnlruproc_sig == 0) { + vnlruproc_sig = 1; + wakeup(vnlruproc); + } +} + /* - * Wait for available vnodes. + * Wait if necessary for space for a new vnode. */ static int getnewvnode_wait(int suspended) { mtx_assert(&vnode_free_list_mtx, MA_OWNED); - if (numvnodes > desiredvnodes) { + if (numvnodes >= desiredvnodes) { if (suspended) { /* - * File system is beeing suspended, we cannot risk a - * deadlock here, so allocate new vnode anyway. + * The file system is being suspended. We cannot + * risk a deadlock here, so allow allocation of + * another vnode even if this would give too many. */ - if (freevnodes > wantfreevnodes) - vnlru_free(freevnodes - wantfreevnodes); return (0); } if (vnlruproc_sig == 0) { @@ -1053,18 +1231,34 @@ getnewvnode_wait(int suspended) msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); } - return (numvnodes > desiredvnodes ? ENFILE : 0); + /* Post-adjust like the pre-adjust in getnewvnode(). */ + if (numvnodes + 1 > desiredvnodes && freevnodes > 1) + vnlru_free(1); + return (numvnodes >= desiredvnodes ? ENFILE : 0); } +/* + * This hack is fragile, and probably not needed any more now that the + * watermark handling works. + */ void getnewvnode_reserve(u_int count) { struct thread *td; + /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */ + /* XXX no longer so quick, but this part is not racy. */ + mtx_lock(&vnode_free_list_mtx); + if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes) + vnlru_free(ulmin(numvnodes + count - desiredvnodes, + freevnodes - wantfreevnodes)); + mtx_unlock(&vnode_free_list_mtx); + td = curthread; /* First try to be quick and racy. */ if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) { td->td_vp_reserv += count; + vcheckspace(); /* XXX no longer so quick, but more racy */ return; } else atomic_subtract_long(&numvnodes, count); @@ -1077,9 +1271,18 @@ getnewvnode_reserve(u_int count) atomic_add_long(&numvnodes, 1); } } + vcheckspace(); mtx_unlock(&vnode_free_list_mtx); } +/* + * This hack is fragile, especially if desiredvnodes or wantvnodes are + * misconfgured or changed significantly. 
Reducing desiredvnodes below + * the reserved amount should cause bizarre behaviour like reducing it + * below the number of active vnodes -- the system will try to reduce + * numvnodes to match, but should fail, so the subtraction below should + * not overflow. + */ void getnewvnode_drop_reserve(void) { @@ -1098,8 +1301,9 @@ getnewvnode(const char *tag, struct moun struct vnode **vpp) { struct vnode *vp; - struct bufobj *bo; struct thread *td; + struct lock_object *lo; + static int cyclecount; int error; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); @@ -1110,57 +1314,77 @@ getnewvnode(const char *tag, struct moun goto alloc; } mtx_lock(&vnode_free_list_mtx); - /* - * Lend our context to reclaim vnodes if they've exceeded the max. - */ - if (freevnodes > wantfreevnodes) + if (numvnodes < desiredvnodes) + cyclecount = 0; + else if (cyclecount++ >= freevnodes) { + cyclecount = 0; + vstir = 1; + } + /* + * Grow the vnode cache if it will not be above its target max + * after growing. Otherwise, if the free list is nonempty, try + * to reclaim 1 item from it before growing the cache (possibly + * above its target max if the reclamation failed or is delayed). + * Otherwise, wait for some space. In all cases, schedule + * vnlru_proc() if we are getting short of space. The watermarks + * should be chosen so that we never wait or even reclaim from + * the free list to below its target minimum. + */ + if (numvnodes + 1 <= desiredvnodes) + ; + else if (freevnodes > 0) vnlru_free(1); - error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & - MNTK_SUSPEND)); + else { + error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & + MNTK_SUSPEND)); #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ - if (error != 0) { - mtx_unlock(&vnode_free_list_mtx); - return (error); - } + if (error != 0) { + mtx_unlock(&vnode_free_list_mtx); + return (error); + } #endif + } + vcheckspace(); atomic_add_long(&numvnodes, 1); mtx_unlock(&vnode_free_list_mtx); alloc: atomic_add_long(&vnodes_created, 1); - vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO); - /* - * Setup locks. - */ - vp->v_vnlock = &vp->v_lock; - mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF); - /* - * By default, don't allow shared locks unless filesystems - * opt-in. - */ - lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE); + vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK); /* - * Initialize bufobj. + * Locks are given the generic name "vnode" when created. + * Follow the historic practice of using the filesystem + * name when they allocated, e.g., "zfs", "ufs", "nfs, etc. + * + * Locks live in a witness group keyed on their name. Thus, + * when a lock is renamed, it must also move from the witness + * group of its old name to the witness group of its new name. + * + * The change only needs to be made when the vnode moves + * from one filesystem type to another. We ensure that each + * filesystem use a single static name pointer for its tag so + * that we can compare pointers rather than doing a strcmp(). */ - bo = &vp->v_bufobj; - bo->__bo_vnode = vp; - rw_init(BO_LOCKPTR(bo), "bufobj interlock"); - bo->bo_ops = &buf_ops_bio; - bo->bo_private = vp; - TAILQ_INIT(&bo->bo_clean.bv_hd); - TAILQ_INIT(&bo->bo_dirty.bv_hd); + lo = &vp->v_vnlock->lock_object; + if (lo->lo_name != tag) { + lo->lo_name = tag; + WITNESS_DESTROY(lo); + WITNESS_INIT(lo, tag); + } /* - * Initialize namecache. + * By default, don't allow shared locks unless filesystems opt-in. 
*/ - LIST_INIT(&vp->v_cache_src); - TAILQ_INIT(&vp->v_cache_dst); + vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE; /* * Finalize various vnode identity bits. */ + KASSERT(vp->v_object == NULL, ("stale v_object %p", vp)); + KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp)); + KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp)); vp->v_type = VNON; vp->v_tag = tag; vp->v_op = vops; v_incr_usecount(vp); - vp->v_data = NULL; + vp->v_bufobj.bo_ops = &buf_ops_bio; #ifdef MAC mac_vnode_init(vp); if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0) @@ -1169,11 +1393,10 @@ alloc: printf("NULL mp in getnewvnode()\n"); #endif if (mp != NULL) { - bo->bo_bsize = mp->mnt_stat.f_iosize; + vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize; if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0) vp->v_vflag |= VV_NOKNOTE; } - rangelock_init(&vp->v_rl); /* * For the filesystems which do not use vfs_hash_insert(), @@ -2399,7 +2622,7 @@ vholdl(struct vnode *vp) mtx_lock(&vnode_free_list_mtx); TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist); freevnodes--; - vp->v_iflag &= ~(VI_FREE|VI_AGE); + vp->v_iflag &= ~VI_FREE; KASSERT((vp->v_iflag & VI_ACTIVE) == 0, ("Activating already active vnode")); vp->v_iflag |= VI_ACTIVE; @@ -2467,15 +2690,9 @@ vdropl(struct vnode *vp) v_actfreelist); mp->mnt_activevnodelistsize--; } - if (vp->v_iflag & VI_AGE) { - TAILQ_INSERT_HEAD(&vnode_free_list, vp, - v_actfreelist); - } else { - TAILQ_INSERT_TAIL(&vnode_free_list, vp, - v_actfreelist); - } + TAILQ_INSERT_TAIL(&vnode_free_list, vp, + v_actfreelist); freevnodes++; - vp->v_iflag &= ~VI_AGE; vp->v_iflag |= VI_FREE; mtx_unlock(&vnode_free_list_mtx); } else { @@ -2486,6 +2703,12 @@ vdropl(struct vnode *vp) } /* * The vnode has been marked for destruction, so free it. + * + * The vnode will be returned to the zone where it will + * normally remain until it is needed for another vnode. We + * need to cleanup (or verify that the cleanup has already + * been done) any residual data left from its current use + * so as not to contaminate the freshly allocated vnode. */ CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp); atomic_subtract_long(&numvnodes, 1); @@ -2506,20 +2729,25 @@ vdropl(struct vnode *vp) VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst")); VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src")); VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for ..")); + VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp, + ("Dangling rangelock waiters")); VI_UNLOCK(vp); #ifdef MAC mac_vnode_destroy(vp); #endif - if (vp->v_pollinfo != NULL) + if (vp->v_pollinfo != NULL) { destroy_vpollinfo(vp->v_pollinfo); + vp->v_pollinfo = NULL; + } #ifdef INVARIANTS /* XXX Elsewhere we detect an already freed vnode via NULL v_op. */ vp->v_op = NULL; #endif - rangelock_destroy(&vp->v_rl); - lockdestroy(vp->v_vnlock); - mtx_destroy(&vp->v_interlock); - rw_destroy(BO_LOCKPTR(bo)); + bzero(&vp->v_un, sizeof(vp->v_un)); + vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; + vp->v_iflag = 0; + vp->v_vflag = 0; + bo->bo_flag = 0; uma_zfree(vnode_zone, vp); } @@ -2884,6 +3112,7 @@ vgonel(struct vnode *vp) * Clear the advisory locks and wake up waiting threads. */ (void)VOP_ADVLOCKPURGE(vp); + vp->v_lockf = NULL; /* * Delete from old mount point vnode list. */ @@ -2986,8 +3215,6 @@ vn_printf(struct vnode *vp, const char * } if (vp->v_iflag & VI_MOUNT) *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
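The vfs_subr.c changes above rework the vnode-cache limits: kern.maxvnodes (desiredvnodes) becomes a target rather than a hard cap, vfs.wantfreevnodes defaults to one quarter of it, and vnlru_proc() now wakes and sleeps based on two watermarks kept roughly 4% and 9% of the usable gap above the free target. The standalone C program below is only an illustration of that arithmetic; the kern.maxvnodes value is made up, and the variable names simply mirror the new vspace() helper, it is not kernel code.

	/*
	 * Illustration (userland, hypothetical maxvnodes) of the watermark
	 * arithmetic added to sys/kern/vfs_subr.c in this merge.
	 */
	#include <stdio.h>

	static int imax(int a, int b) { return (a > b ? a : b); }

	int
	main(void)
	{
		int desiredvnodes = 500000;		/* hypothetical kern.maxvnodes */
		int wantfreevnodes = desiredvnodes / 4;	/* new default: 25% of the target */
		int gapvnodes, vhiwat, vlowat;

		/* Same formulas as the new vspace() helper in the diff. */
		gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
		vhiwat = gapvnodes / 11;	/* ~9%, just under the 10% used by vlrureclaim() */
		vlowat = vhiwat / 2;		/* ~4% */

		printf("wantfreevnodes=%d gapvnodes=%d vhiwat=%d vlowat=%d\n",
		    wantfreevnodes, gapvnodes, vhiwat, vlowat);
		return (0);
	}

As the new comment in the diff notes, with the 25% default for wantfreevnodes the 9% watermark alone can amount to more than 566000 vnodes to reclaim on a machine running at MAXVNODES_MAX, which is why the comment flags these percentages as too large for very large memory sizes.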
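Both balloon drivers stop allocating pages with VM_ALLOC_WIRED, and the Xen driver drops the malloc'd struct balloon_entry wrapper: ballooned pages are now linked straight onto a TAILQ through the page's own plinks.q queue entry. The sketch below shows only the list manipulation, in ordinary userland C; struct fake_page and its link member are stand-ins for struct vm_page and plinks.q, and the physical address is invented.

	/*
	 * Userland sketch of the list change in the Xen balloon driver:
	 * pages are threaded directly onto the balloon list through an
	 * embedded queue entry instead of a separately allocated wrapper.
	 */
	#include <sys/queue.h>
	#include <stdio.h>

	struct fake_page {
		unsigned long phys_addr;
		TAILQ_ENTRY(fake_page) link;	/* plays the role of plinks.q */
	};

	static TAILQ_HEAD(, fake_page) ballooned_pages =
	    TAILQ_HEAD_INITIALIZER(ballooned_pages);
	static unsigned long balloon_low;

	/* Inflate: stash a page on the balloon list; no malloc, so no ENOMEM path. */
	static void
	balloon_append(struct fake_page *p)
	{
		TAILQ_INSERT_HEAD(&ballooned_pages, p, link);
		balloon_low++;
	}

	/* Deflate: take the first ballooned page back, if any. */
	static struct fake_page *
	balloon_retrieve(void)
	{
		struct fake_page *p;

		if ((p = TAILQ_FIRST(&ballooned_pages)) != NULL) {
			TAILQ_REMOVE(&ballooned_pages, p, link);
			balloon_low--;
		}
		return (p);
	}

	int
	main(void)
	{
		struct fake_page pg = { .phys_addr = 0x1000 };

		balloon_append(&pg);
		printf("retrieved page at %#lx, %lu left\n",
		    balloon_retrieve()->phys_addr, balloon_low);
		return (0);
	}

Dropping the wrapper also removes the ENOMEM failure path that the old balloon_append() had, which is why decrease_reservation() in the diff no longer needs the unwire-and-free fallback.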
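The NFS client change introduces a single shared tag string, nfs_vnode_tag ("newnfs"), used by both getnewvnode() call sites. The comment in the reworked getnewvnode() explains why: a recycled vnode's lock keeps its previous name, and the lock is renamed (and moved to a new witness group) only when the tag pointer differs, so each filesystem should pass one static string rather than repeated literals. Below is a minimal userland sketch of that pointer-comparison idea, with the kernel's WITNESS calls reduced to a comment; the retag_lock() helper is invented for illustration.

	#include <stdio.h>

	static const char nfs_vnode_tag[] = "newnfs";

	static void
	retag_lock(const char **lock_name, const char *tag)
	{
		if (*lock_name != tag) {	/* pointer compare, as in the new getnewvnode() */
			/* in the kernel: WITNESS_DESTROY() then WITNESS_INIT() with the new tag */
			*lock_name = tag;
			printf("lock renamed to \"%s\"\n", tag);
		} else
			printf("tag unchanged, rename skipped\n");
	}

	int
	main(void)
	{
		const char *lock_name = "vnode";	/* default name from vnode_init() */

		retag_lock(&lock_name, nfs_vnode_tag);	/* first NFS use: rename needed */
		retag_lock(&lock_name, nfs_vnode_tag);	/* NFS-to-NFS reuse: rename skipped */
		return (0);
	}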