Date: Tue, 23 Feb 2016 01:09:35 +0000 (UTC) From: Marius Strobl <marius@FreeBSD.org> To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org Subject: svn commit: r295905 - stable/10/sys/kern Message-ID: <201602230109.u1N19Z5j076529@repo.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: marius Date: Tue Feb 23 01:09:35 2016 New Revision: 295905 URL: https://svnweb.freebsd.org/changeset/base/295905 Log: In preparation for 10.3-RELEASE, temporarily revert the MFC of r291244 done as part of r292895 on stable/10 as that change causes hangs with ZFS and the cause, on at least amd64, is so far not understood. Discussed with: kib For further information see: https://lists.freebsd.org/pipermail/freebsd-stable/2016-February/084045.html PR: 207281 Approved by: re (gjb) Modified: stable/10/sys/kern/vfs_subr.c Directory Properties: stable/10/ (props changed) Modified: stable/10/sys/kern/vfs_subr.c ============================================================================== --- stable/10/sys/kern/vfs_subr.c Tue Feb 23 01:08:39 2016 (r295904) +++ stable/10/sys/kern/vfs_subr.c Tue Feb 23 01:09:35 2016 (r295905) @@ -145,51 +145,24 @@ int vttoif_tab[10] = { static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* - * "Free" vnode target. Free vnodes are rarely completely free, but are - * just ones that are cheap to recycle. Usually they are for files which - * have been stat'd but not read; these usually have inode and namecache - * data attached to them. This target is the preferred minimum size of a - * sub-cache consisting mostly of such files. The system balances the size - * of this sub-cache with its complement to try to prevent either from - * thrashing while the other is relatively inactive. The targets express - * a preference for the best balance. - * - * "Above" this target there are 2 further targets (watermarks) related - * to recyling of free vnodes. In the best-operating case, the cache is - * exactly full, the free list has size between vlowat and vhiwat above the - * free target, and recycling from it and normal use maintains this state. - * Sometimes the free list is below vlowat or even empty, but this state - * is even better for immediate use provided the cache is not full. 
- * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free - * ones) to reach one of these states. The watermarks are currently hard- - * coded as 4% and 9% of the available space higher. These and the default - * of 25% for wantfreevnodes are too large if the memory size is large. - * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim - * whenever vnlru_proc() becomes active. + * Free vnode target. Free vnodes may simply be files which have been stat'd + * but not read. This is somewhat common, and a small cache of such files + * should be kept to avoid recreation costs. */ static u_long wantfreevnodes; -SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, - &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes"); +SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); +/* Number of vnodes in the free list. */ static u_long freevnodes; -SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, - &freevnodes, 0, "Number of \"free\" vnodes"); +SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, + "Number of vnodes in the free list"); -/* - * The vfs.vlru_allow_cache_src sysctl variable is no longer used but - * the sysctl remains to provide ABI compatibility. The new code frees - * namecache sources as the last chance to satisfy the highest watermark, - * instead of selecting the source vnodes randomly. This provides good - * enough behaviour to keep vn_fullpath() working in most situations. - * The filesystem layout with deep trees, where the depricated knob was - * required, is thus handled automatically. 
- */ static int vlru_allow_cache_src; SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW, - &vlru_allow_cache_src, 0, "Placeholder for API compatibility (unused)"); + &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode"); static u_long recycles_count; SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0, - "Number of vnodes recycled to meet vnode cache targets"); + "Number of vnodes recycled to avoid exceding kern.maxvnodes"); /* * Various variables used for debugging the new implementation of @@ -299,13 +272,14 @@ static int syncer_worklist_len; static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY } syncer_state; -/* Target for maximum number of vnodes. */ +/* + * Number of vnodes we want to exist at any one time. This is mostly used + * to size hash tables in vnode-related code. It is normally not used in + * getnewvnode(), as wantfreevnodes is normally nonzero.) + * + * XXX desiredvnodes is historical cruft and should not exist. + */ int desiredvnodes; -static int gapvnodes; /* gap between wanted and desired */ -static int vhiwat; /* enough extras after expansion */ -static int vlowat; /* minimal extras before expansion */ -static int vstir; /* nonzero to stir non-free vnodes */ -static volatile int vsmalltrigger = 8; /* pref to keep if > this many pages */ static int sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS) @@ -316,8 +290,6 @@ sysctl_update_desiredvnodes(SYSCTL_HANDL if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0) return (error); if (old_desiredvnodes != desiredvnodes) { - wantfreevnodes = desiredvnodes / 4; - /* XXX locking seems to be incomplete. 
*/ vfs_hash_changesize(desiredvnodes); cache_changesize(desiredvnodes); } @@ -326,9 +298,9 @@ sysctl_update_desiredvnodes(SYSCTL_HANDL SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0, - sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes"); + sysctl_update_desiredvnodes, "I", "Maximum number of vnodes"); SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW, - &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)"); + &wantfreevnodes, 0, "Minimum number of vnodes (legacy)"); static int vnlru_nowhere; SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW, &vnlru_nowhere, 0, "Number of times the vnlru process ran without success"); @@ -359,10 +331,10 @@ PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_tr * * Reevaluate the following cap on the number of vnodes after the physical * memory size exceeds 512GB. In the limit, as the physical memory size - * grows, the ratio of the memory size in KB to to vnodes approaches 64:1. + * grows, the ratio of physical pages to vnodes approaches sixteen to one. */ #ifndef MAXVNODES_MAX -#define MAXVNODES_MAX (512 * 1024 * 1024 / 64) /* 8M */ +#define MAXVNODES_MAX (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16)) #endif /* @@ -433,16 +405,15 @@ vntblinit(void *dummy __unused) /* * Desiredvnodes is a function of the physical memory size and the * kernel's heap size. Generally speaking, it scales with the - * physical memory size. The ratio of desiredvnodes to the physical - * memory size is 1:16 until desiredvnodes exceeds 98,304. - * Thereafter, the - * marginal ratio of desiredvnodes to the physical memory size is - * 1:64. However, desiredvnodes is limited by the kernel's heap + * physical memory size. The ratio of desiredvnodes to physical pages + * is one to four until desiredvnodes exceeds 98,304. Thereafter, the + * marginal ratio of desiredvnodes to physical pages is one to + * sixteen. However, desiredvnodes is limited by the kernel's heap * size. 
The memory required by desiredvnodes vnodes and vm objects - * must not exceed 1/7th of the kernel's heap size. + * may not exceed one seventh of the kernel's heap size. */ - physvnodes = maxproc + pgtok(cnt.v_page_count) / 64 + - 3 * min(98304 * 16, pgtok(cnt.v_page_count)) / 64; + physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4, + cnt.v_page_count) / 16; virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) + sizeof(struct vnode))); desiredvnodes = min(physvnodes, virtvnodes); @@ -831,41 +802,35 @@ vattr_null(struct vattr *vap) * you set kern.maxvnodes to. Do not set kern.maxvnodes too low. */ static int -vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger) +vlrureclaim(struct mount *mp) { struct vnode *vp; - int count, done, target; + int done; + int trigger; + int usevnodes; + int count; + /* + * Calculate the trigger point, don't allow user + * screwups to blow us up. This prevents us from + * recycling vnodes with lots of resident pages. We + * aren't trying to free memory, we are trying to + * free vnodes. + */ + usevnodes = desiredvnodes; + if (usevnodes <= 0) + usevnodes = 1; + trigger = cnt.v_page_count * 2 / usevnodes; done = 0; vn_start_write(NULL, &mp, V_WAIT); MNT_ILOCK(mp); - count = mp->mnt_nvnodelistsize; - target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1); - target = target / 10 + 1; - while (count != 0 && done < target) { + count = mp->mnt_nvnodelistsize / 10 + 1; + while (count != 0) { vp = TAILQ_FIRST(&mp->mnt_nvnodelist); while (vp != NULL && vp->v_type == VMARKER) vp = TAILQ_NEXT(vp, v_nmntvnodes); if (vp == NULL) break; - /* - * XXX LRU is completely broken for non-free vnodes. First - * by calling here in mountpoint order, then by moving - * unselected vnodes to the end here, and most grossly by - * removing the vlruvp() function that was supposed to - * maintain the order. (This function was born broken - * since syncer problems prevented it doing anything.) 
The - * order is closer to LRC (C = Created). - * - * LRU reclaiming of vnodes seems to have last worked in - * FreeBSD-3 where LRU wasn't mentioned under any spelling. - * Then there was no hold count, and inactive vnodes were - * simply put on the free list in LRU order. The separate - * lists also break LRU. We prefer to reclaim from the - * free list for technical reasons. This tends to thrash - * the free list to keep very unrecently used held vnodes. - * The problem is mitigated by keeping the free list large. - */ TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes); TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes); --count; @@ -874,12 +839,10 @@ vlrureclaim(struct mount *mp, int reclai /* * If it's been deconstructed already, it's still * referenced, or it exceeds the trigger, skip it. - * Also skip free vnodes. We are trying to make space - * to expand the free list, not reduce it. */ if (vp->v_usecount || - (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || - ((vp->v_iflag & VI_FREE) != 0) || + (!vlru_allow_cache_src && + !LIST_EMPTY(&(vp)->v_cache_src)) || (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VI_UNLOCK(vp); @@ -905,8 +868,8 @@ vlrureclaim(struct mount *mp, int reclai * vnode lock before our VOP_LOCK() call fails. */ if (vp->v_usecount || - (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) || - (vp->v_iflag & VI_FREE) != 0 || + (!vlru_allow_cache_src && + !LIST_EMPTY(&(vp)->v_cache_src)) || (vp->v_object != NULL && vp->v_object->resident_page_count > trigger)) { VOP_UNLOCK(vp, LK_INTERLOCK); @@ -939,7 +902,7 @@ relock_mnt: } /* - * Attempt to reduce the free list by the requested amount. + * Attempt to keep the free list at wantfreevnodes length. */ static void vnlru_free(int count) @@ -996,24 +959,6 @@ vnlru_free(int count) mtx_lock(&vnode_free_list_mtx); } } - -/* XXX some names and initialization are bad for limits and watermarks. 
*/ -static int -vspace(void) -{ - int space; - - gapvnodes = imax(desiredvnodes - wantfreevnodes, 100); - vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */ - vlowat = vhiwat / 2; - if (numvnodes > desiredvnodes) - return (0); - space = desiredvnodes - numvnodes; - if (freevnodes > wantfreevnodes) - space += freevnodes - wantfreevnodes; - return (space); -} - /* * Attempt to recycle vnodes in a context that is always safe to block. * Calling vlrurecycle() from the bowels of filesystem code has some @@ -1026,36 +971,18 @@ static void vnlru_proc(void) { struct mount *mp, *nmp; - unsigned long ofreevnodes, onumvnodes; - int done, force, reclaim_nc_src, trigger, usevnodes; + int done; + struct proc *p = vnlruproc; - EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc, + EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p, SHUTDOWN_PRI_FIRST); - force = 0; for (;;) { - kproc_suspend_check(vnlruproc); + kproc_suspend_check(p); mtx_lock(&vnode_free_list_mtx); - /* - * If numvnodes is too large (due to desiredvnodes being - * adjusted using its sysctl, or emergency growth), first - * try to reduce it by discarding from the free list. - */ - if (numvnodes > desiredvnodes && freevnodes > 0) - vnlru_free(ulmin(numvnodes - desiredvnodes, - freevnodes)); - /* - * Sleep if the vnode cache is in a good state. This is - * when it is not over-full and has space for about a 4% - * or 9% expansion (by growing its size or inexcessively - * reducing its free list). Otherwise, try to reclaim - * space for a 10% expansion. 
- */ - if (vstir && force == 0) { - force = 1; - vstir = 0; - } - if (vspace() >= vlowat && force == 0) { + if (freevnodes > wantfreevnodes) + vnlru_free(freevnodes - wantfreevnodes); + if (numvnodes <= desiredvnodes * 9 / 10) { vnlruproc_sig = 0; wakeup(&vnlruproc_sig); msleep(vnlruproc, &vnode_free_list_mtx, @@ -1064,66 +991,30 @@ vnlru_proc(void) } mtx_unlock(&vnode_free_list_mtx); done = 0; - ofreevnodes = freevnodes; - onumvnodes = numvnodes; - /* - * Calculate parameters for recycling. These are the same - * throughout the loop to give some semblance of fairness. - * The trigger point is to avoid recycling vnodes with lots - * of resident pages. We aren't trying to free memory; we - * are trying to recycle or at least free vnodes. - */ - if (numvnodes <= desiredvnodes) - usevnodes = numvnodes - freevnodes; - else - usevnodes = numvnodes; - if (usevnodes <= 0) - usevnodes = 1; - /* - * The trigger value is is chosen to give a conservatively - * large value to ensure that it alone doesn't prevent - * making progress. The value can easily be so large that - * it is effectively infinite in some congested and - * misconfigured cases, and this is necessary. Normally - * it is about 8 to 100 (pages), which is quite large. 
- */ - trigger = cnt.v_page_count * 2 / usevnodes; - if (force < 2) - trigger = vsmalltrigger; - reclaim_nc_src = force >= 3; mtx_lock(&mountlist_mtx); for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) { if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) { nmp = TAILQ_NEXT(mp, mnt_list); continue; } - done += vlrureclaim(mp, reclaim_nc_src, trigger); + done += vlrureclaim(mp); mtx_lock(&mountlist_mtx); nmp = TAILQ_NEXT(mp, mnt_list); vfs_unbusy(mp); } mtx_unlock(&mountlist_mtx); - if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes) - uma_reclaim(); if (done == 0) { - if (force == 0 || force == 1) { - force = 2; - continue; - } - if (force == 2) { - force = 3; - continue; - } - force = 0; +#if 0 + /* These messages are temporary debugging aids */ + if (vnlru_nowhere < 5) + printf("vnlru process getting nowhere..\n"); + else if (vnlru_nowhere == 5) + printf("vnlru process messages stopped.\n"); +#endif vnlru_nowhere++; tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3); } else kern_yield(PRI_USER); - /* - * After becoming active to expand above low water, keep - * active until above high water. - */ - force = vspace() < vhiwat; } } @@ -1197,31 +1088,22 @@ vtryrecycle(struct vnode *vp) return (0); } -static void -vcheckspace(void) -{ - - if (vspace() < vlowat && vnlruproc_sig == 0) { - vnlruproc_sig = 1; - wakeup(vnlruproc); - } -} - /* - * Wait if necessary for space for a new vnode. + * Wait for available vnodes. */ static int getnewvnode_wait(int suspended) { mtx_assert(&vnode_free_list_mtx, MA_OWNED); - if (numvnodes >= desiredvnodes) { + if (numvnodes > desiredvnodes) { if (suspended) { /* - * The file system is being suspended. We cannot - * risk a deadlock here, so allow allocation of - * another vnode even if this would give too many. + * File system is beeing suspended, we cannot risk a + * deadlock here, so allocate new vnode anyway. 
*/ + if (freevnodes > wantfreevnodes) + vnlru_free(freevnodes - wantfreevnodes); return (0); } if (vnlruproc_sig == 0) { @@ -1231,34 +1113,18 @@ getnewvnode_wait(int suspended) msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS, "vlruwk", hz); } - /* Post-adjust like the pre-adjust in getnewvnode(). */ - if (numvnodes + 1 > desiredvnodes && freevnodes > 1) - vnlru_free(1); - return (numvnodes >= desiredvnodes ? ENFILE : 0); + return (numvnodes > desiredvnodes ? ENFILE : 0); } -/* - * This hack is fragile, and probably not needed any more now that the - * watermark handling works. - */ void getnewvnode_reserve(u_int count) { struct thread *td; - /* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */ - /* XXX no longer so quick, but this part is not racy. */ - mtx_lock(&vnode_free_list_mtx); - if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes) - vnlru_free(ulmin(numvnodes + count - desiredvnodes, - freevnodes - wantfreevnodes)); - mtx_unlock(&vnode_free_list_mtx); - td = curthread; /* First try to be quick and racy. */ if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) { td->td_vp_reserv += count; - vcheckspace(); /* XXX no longer so quick, but more racy */ return; } else atomic_subtract_long(&numvnodes, count); @@ -1271,18 +1137,9 @@ getnewvnode_reserve(u_int count) atomic_add_long(&numvnodes, 1); } } - vcheckspace(); mtx_unlock(&vnode_free_list_mtx); } -/* - * This hack is fragile, especially if desiredvnodes or wantvnodes are - * misconfgured or changed significantly. Reducing desiredvnodes below - * the reserved amount should cause bizarre behaviour like reducing it - * below the number of active vnodes -- the system will try to reduce - * numvnodes to match, but should fail, so the subtraction below should - * not overflow. 
- */ void getnewvnode_drop_reserve(void) { @@ -1303,7 +1160,6 @@ getnewvnode(const char *tag, struct moun struct vnode *vp; struct thread *td; struct lock_object *lo; - static int cyclecount; int error; CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag); @@ -1314,37 +1170,19 @@ getnewvnode(const char *tag, struct moun goto alloc; } mtx_lock(&vnode_free_list_mtx); - if (numvnodes < desiredvnodes) - cyclecount = 0; - else if (cyclecount++ >= freevnodes) { - cyclecount = 0; - vstir = 1; - } - /* - * Grow the vnode cache if it will not be above its target max - * after growing. Otherwise, if the free list is nonempty, try - * to reclaim 1 item from it before growing the cache (possibly - * above its target max if the reclamation failed or is delayed). - * Otherwise, wait for some space. In all cases, schedule - * vnlru_proc() if we are getting short of space. The watermarks - * should be chosen so that we never wait or even reclaim from - * the free list to below its target minimum. - */ - if (numvnodes + 1 <= desiredvnodes) - ; - else if (freevnodes > 0) + /* + * Lend our context to reclaim vnodes if they've exceeded the max. + */ + if (freevnodes > wantfreevnodes) vnlru_free(1); - else { - error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & - MNTK_SUSPEND)); + error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag & + MNTK_SUSPEND)); #if 0 /* XXX Not all VFS_VGET/ffs_vget callers check returns. */ - if (error != 0) { - mtx_unlock(&vnode_free_list_mtx); - return (error); - } -#endif + if (error != 0) { + mtx_unlock(&vnode_free_list_mtx); + return (error); } - vcheckspace(); +#endif atomic_add_long(&numvnodes, 1); mtx_unlock(&vnode_free_list_mtx); alloc:
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201602230109.u1N19Z5j076529>