From: Konstantin Belousov <kib@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-user@freebsd.org
Date: Sat, 20 Feb 2010 16:34:42 +0000 (UTC)
Subject: svn commit: r204132 - in user/kib/vm6/sys: conf dev/md kern sys ufs/ffs ufs/ufs vm
Message-Id: <201002201634.o1KGYg2f057928@svn.freebsd.org>

Author: kib
Date: Sat Feb 20 16:34:42 2010
New Revision: 204132

URL: http://svn.freebsd.org/changeset/base/204132

Log:
  Implementation of range locking for i/o and vm i/o.

  First vm_readwrite.c implementation by:    jeff
  In collaboration with:    pho
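Before the diff itself, a sketch of the call discipline the new API supports: a
byte range is locked, the I/O proceeds, and the opaque cookie returned by the
lock routine is handed back to unlock. The *_model() functions below are
hypothetical userland stand-ins, not part of the commit; the real kernel
functions added below (rangelock_rlock(), rangelock_wlock(),
rangelock_unlock()) take a struct vnode * and sleep on contention.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical userland stand-ins for the kernel rangelock API. */
static void *
rangelock_rlock_model(long base, long len)
{
    long *cookie;

    cookie = malloc(2 * sizeof(long));
    cookie[0] = base;
    cookie[1] = base + len;
    printf("read-locked [%ld, %ld)\n", cookie[0], cookie[1]);
    return (cookie);    /* the opaque cookie identifies the held range */
}

static void
rangelock_unlock_model(void *cookie)
{
    long *c = cookie;

    printf("unlocked [%ld, %ld)\n", c[0], c[1]);
    free(c);
}

int
main(void)
{
    /* The pattern vn_read() follows below: lock, do the I/O, unlock. */
    void *rl_cookie = rangelock_rlock_model(0, 4096);

    /* ... copy out file data; vn_read_chunk() plays this role below ... */
    rangelock_unlock_model(rl_cookie);
    return (0);
}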
Added:
  user/kib/vm6/sys/kern/kern_rangelock.c   (contents, props changed)
  user/kib/vm6/sys/sys/rangelock.h   (contents, props changed)
  user/kib/vm6/sys/vm/vm_readwrite.c   (contents, props changed)
Modified:
  user/kib/vm6/sys/conf/files
  user/kib/vm6/sys/dev/md/md.c
  user/kib/vm6/sys/kern/vfs_cluster.c
  user/kib/vm6/sys/kern/vfs_default.c
  user/kib/vm6/sys/kern/vfs_subr.c
  user/kib/vm6/sys/kern/vfs_vnops.c
  user/kib/vm6/sys/kern/vnode_if.src
  user/kib/vm6/sys/sys/buf.h
  user/kib/vm6/sys/sys/file.h
  user/kib/vm6/sys/sys/proc.h
  user/kib/vm6/sys/sys/vnode.h
  user/kib/vm6/sys/ufs/ffs/ffs_balloc.c
  user/kib/vm6/sys/ufs/ffs/ffs_softdep.c
  user/kib/vm6/sys/ufs/ffs/ffs_vnops.c
  user/kib/vm6/sys/ufs/ufs/ufs_vnops.c
  user/kib/vm6/sys/vm/vm_extern.h
  user/kib/vm6/sys/vm/vm_fault.c
  user/kib/vm6/sys/vm/vm_page.c
  user/kib/vm6/sys/vm/vm_page.h
  user/kib/vm6/sys/vm/vm_pageout.c
  user/kib/vm6/sys/vm/vm_pageout.h
  user/kib/vm6/sys/vm/vm_phys.c
  user/kib/vm6/sys/vm/vnode_pager.c

Modified: user/kib/vm6/sys/conf/files
==============================================================================
--- user/kib/vm6/sys/conf/files    Sat Feb 20 16:32:33 2010    (r204131)
+++ user/kib/vm6/sys/conf/files    Sat Feb 20 16:34:42 2010    (r204132)
@@ -2077,6 +2077,7 @@ kern/kern_poll.c    optional device_pollin
 kern/kern_priv.c        standard
 kern/kern_proc.c        standard
 kern/kern_prot.c        standard
+kern/kern_rangelock.c        standard
 kern/kern_resource.c        standard
 kern/kern_rmlock.c        standard
 kern/kern_rwlock.c        standard
@@ -2768,6 +2769,7 @@ vm/vm_page.c    standard
 vm/vm_pageout.c        standard
 vm/vm_pager.c        standard
 vm/vm_phys.c        standard
+vm/vm_readwrite.c    standard
 vm/vm_reserv.c        standard
 vm/vm_unix.c        standard
 vm/vm_zeroidle.c    standard
Modified: user/kib/vm6/sys/dev/md/md.c
==============================================================================
--- user/kib/vm6/sys/dev/md/md.c    Sat Feb 20 16:32:33 2010    (r204131)
+++ user/kib/vm6/sys/dev/md/md.c    Sat Feb 20 16:34:42 2010    (r204132)
@@ -85,6 +85,7 @@
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/uma.h>
@@ -587,7 +588,7 @@ mdstart_swap(struct md_s *sc, struct bio
 {
     struct sf_buf *sf;
     int rv, offs, len, lastend;
-    vm_pindex_t i, lastp;
+    vm_pindex_t i, firstp, lastp;
     vm_page_t m;
     u_char *p;
 
@@ -610,18 +611,26 @@ mdstart_swap(struct md_s *sc, struct bio
      * we're operating on complete aligned pages).
      */
     offs = bp->bio_offset % PAGE_SIZE;
+    firstp = bp->bio_offset / PAGE_SIZE;
     lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
     lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
 
+    vm_page_t ma[lastp - firstp + 1];
+
     rv = VM_PAGER_OK;
     VM_OBJECT_LOCK(sc->object);
     vm_object_pip_add(sc->object, 1);
-    for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
+    for (i = firstp; i <= lastp; i++) {
         len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
-        m = vm_page_grab(sc->object, i,
-            VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
+        /*
+         * A write cleans the buffer's pages, so give it
+         * priority.
+         */
+        m = vm_page_grab(sc->object, i, (bp->bio_cmd == BIO_WRITE ?
+            VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_RETRY);
         VM_OBJECT_UNLOCK(sc->object);
+        ma[i - firstp] = m;
         sched_pin();
         sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
         VM_OBJECT_LOCK(sc->object);
@@ -683,6 +692,12 @@ printf("wire_count %d busy %d flags %x h
     }
     vm_object_pip_subtract(sc->object, 1);
     vm_object_set_writeable_dirty(sc->object);
+    if (rv != VM_PAGER_ERROR && bp->bio_cmd == BIO_WRITE &&
+        vm_page_count_severe()) {
+        vm_page_lock_queues();
+        vm_pageout_flush(ma, lastp - firstp + 1, IO_SYNC);
+        vm_page_unlock_queues();
+    }
     VM_OBJECT_UNLOCK(sc->object);
     return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
 }
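The new firstp/lastp bookkeeping maps the bio's byte range onto an inclusive
range of page indices, with offs and lastend marking the partial first and
last pages. A standalone check of that arithmetic, not part of the commit
(PAGE_SIZE hard-coded to 4096 purely for illustration):

#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE 4096

int
main(void)
{
    long bio_offset = 6000, bio_length = 5000;    /* arbitrary example */
    long offs = bio_offset % PAGE_SIZE;        /* offset into first page */
    long firstp = bio_offset / PAGE_SIZE;        /* first page index */
    long lastp = (bio_offset + bio_length - 1) / PAGE_SIZE;    /* last page */
    long lastend = (bio_offset + bio_length - 1) % PAGE_SIZE + 1;
    long total = 0;

    /* The per-page lengths must add up to the original request. */
    for (long i = firstp; i <= lastp; i++)
        total += ((i == lastp) ? lastend : PAGE_SIZE) -
            (i == firstp ? offs : 0);
    assert(total == bio_length);
    printf("pages %ld..%ld, offs %ld, lastend %ld\n",
        firstp, lastp, offs, lastend);
    return (0);
}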
Added: user/kib/vm6/sys/kern/kern_rangelock.c
==============================================================================
--- /dev/null    00:00:00 1970    (empty, because file is newly added)
+++ user/kib/vm6/sys/kern/kern_rangelock.c    Sat Feb 20 16:34:42 2010    (r204132)
@@ -0,0 +1,186 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov <kib@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/rangelock.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+
+uma_zone_t rl_entry_zone;
+
+static void
+rangelock_sys_init(void)
+{
+
+    rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
+        NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, rangelock_sys_init, NULL);
+
+void
+rangelock_init(struct rangelock *lock)
+{
+
+    TAILQ_INIT(&lock->rl_waiters);
+    lock->rl_currdep = NULL;
+}
+
+void
+rangelock_destroy(struct rangelock *lock)
+{
+
+    KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters"));
+}
+
+static int
+rangelock_incompatible(const struct rl_q_entry *e1,
+    const struct rl_q_entry *e2)
+{
+
+    if ((e1->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ &&
+        (e2->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ)
+        return (0);
+#define    IN_RANGE(a, e) (a >= e->rl_q_start && a < e->rl_q_end)
+    if (IN_RANGE(e1->rl_q_start, e2) || IN_RANGE(e2->rl_q_start, e1) ||
+        IN_RANGE(e1->rl_q_end, e2) || IN_RANGE(e2->rl_q_end, e1))
+        return (1);
+#undef    IN_RANGE
+    return (0);
+}
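The compatibility test above can be checked in isolation. The program below,
not part of the commit, lifts IN_RANGE verbatim into userland (the struct
mirrors rl_q_entry; both entries are treated as writers, so the read/read
short-circuit does not apply). Note that because the exclusive end offsets are
also fed through IN_RANGE, merely adjacent ranges such as [0, 10) and [10, 20)
are reported as incompatible as well, a conservative outcome.

#include <stdio.h>

/* Userland mirror of rl_q_entry; ranges are [start, end). */
struct rl_q_entry { long rl_q_start, rl_q_end; };

#define IN_RANGE(a, e) (a >= e->rl_q_start && a < e->rl_q_end)

static int
incompatible(const struct rl_q_entry *e1, const struct rl_q_entry *e2)
{
    return (IN_RANGE(e1->rl_q_start, e2) || IN_RANGE(e2->rl_q_start, e1) ||
        IN_RANGE(e1->rl_q_end, e2) || IN_RANGE(e2->rl_q_end, e1));
}

int
main(void)
{
    struct rl_q_entry a = { 0, 10 }, b = { 10, 20 }, c = { 5, 15 };

    printf("overlapping: %d\n", incompatible(&a, &c));    /* 1 */
    printf("adjacent:    %d\n", incompatible(&a, &b));    /* 1: conservative */
    return (0);
}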
+static void
+rangelock_calc_block(struct rangelock *lock)
+{
+    struct rl_q_entry *entry, *entry1, *whead;
+
+    if (lock->rl_currdep == TAILQ_FIRST(&lock->rl_waiters) &&
+        lock->rl_currdep != NULL)
+        lock->rl_currdep = TAILQ_NEXT(lock->rl_currdep, rl_q_link);
+    for (entry = lock->rl_currdep; entry;
+        entry = TAILQ_NEXT(entry, rl_q_link)) {
+        TAILQ_FOREACH(entry1, &lock->rl_waiters, rl_q_link) {
+            if (rangelock_incompatible(entry, entry1))
+                goto out;
+            if (entry1 == entry)
+                break;
+        }
+    }
+out:
+    lock->rl_currdep = entry;
+    TAILQ_FOREACH(whead, &lock->rl_waiters, rl_q_link) {
+        if (whead == lock->rl_currdep)
+            break;
+        if (!(whead->rl_q_flags & RL_LOCK_GRANTED)) {
+            whead->rl_q_flags |= RL_LOCK_GRANTED;
+            wakeup(whead);
+        }
+    }
+}
+
+static void
+rangelock_unlock_vp_locked(struct vnode *vp, struct rl_q_entry *entry)
+{
+
+    ASSERT_VI_LOCKED(vp, "rangelock");
+    KASSERT(entry != vp->v_rl.rl_currdep, ("stuck currdep"));
+    TAILQ_REMOVE(&vp->v_rl.rl_waiters, entry, rl_q_link);
+    rangelock_calc_block(&vp->v_rl);
+    VI_UNLOCK(vp);
+    uma_zfree(rl_entry_zone, entry);
+}
+
+void
+rangelock_unlock(struct vnode *vp, void *cookie)
+{
+    struct rl_q_entry *entry;
+
+    entry = cookie;
+    VI_LOCK(vp);
+    rangelock_unlock_vp_locked(vp, entry);
+}
+
+void *
+rangelock_unlock_range(struct vnode *vp, void *cookie, off_t base, size_t len)
+{
+    struct rl_q_entry *entry;
+
+    entry = cookie;
+    VI_LOCK(vp);
+    KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED, ("XXX"));
+    KASSERT(entry->rl_q_start == base, ("XXX"));
+    KASSERT(entry->rl_q_end >= base + len, ("XXX"));
+    if (entry->rl_q_end == base + len) {
+        rangelock_unlock_vp_locked(vp, cookie);
+        return (NULL);
+    }
+    entry->rl_q_end = base + len;
+    rangelock_calc_block(&vp->v_rl);
+    VI_UNLOCK(vp);
+    return (cookie);
+}
+
+static void *
+rangelock_enqueue(struct vnode *vp, struct rl_q_entry *entry)
+{
+
+    VI_LOCK(vp);
+    TAILQ_INSERT_TAIL(&vp->v_rl.rl_waiters, entry, rl_q_link);
+    if (vp->v_rl.rl_currdep == NULL)
+        vp->v_rl.rl_currdep = entry;
+    rangelock_calc_block(&vp->v_rl);
+    while (!(entry->rl_q_flags & RL_LOCK_GRANTED))
+        msleep(entry, &vp->v_interlock, 0, "range", 0);
+    VI_UNLOCK(vp);
+    return (entry);
+}
+
+void *
+rangelock_rlock(struct vnode *vp, off_t base, size_t len)
+{
+    struct rl_q_entry *entry;
+
+    entry = uma_zalloc(rl_entry_zone, M_WAITOK);
+    entry->rl_q_flags = RL_LOCK_READ;
+    entry->rl_q_start = base;
+    entry->rl_q_end = base + len;
+    return (rangelock_enqueue(vp, entry));
+}
+
+void *
+rangelock_wlock(struct vnode *vp, off_t base, size_t len)
+{
+    struct rl_q_entry *entry;
+
+    entry = uma_zalloc(rl_entry_zone, M_WAITOK);
+    entry->rl_q_flags = RL_LOCK_WRITE;
+    entry->rl_q_start = base;
+    entry->rl_q_end = base + len;
+    return (rangelock_enqueue(vp, entry));
+}

Modified: user/kib/vm6/sys/kern/vfs_cluster.c
==============================================================================
--- user/kib/vm6/sys/kern/vfs_cluster.c    Sat Feb 20 16:32:33 2010    (r204131)
+++ user/kib/vm6/sys/kern/vfs_cluster.c    Sat Feb 20 16:34:42 2010    (r204132)
@@ -71,8 +71,8 @@ static int write_behind = 1;
 SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
     "Cluster write-behind; 0: disable, 1: enable, 2: backed off");
 
-static int read_max = 8;
-SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
+int vfs_read_max = 8;
+SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &vfs_read_max, 0,
     "Cluster read-ahead max block count");
 
 /* Page expended to mark partially backed buffers */
@@ -109,7 +109,7 @@ cluster_read(vp, filesize, lblkno, size,
      */
     racluster = vp->v_mount->mnt_iosize_max / size;
     maxra = seqcount;
-    maxra = min(read_max, maxra);
+    maxra = min(vfs_read_max, maxra);
     maxra = min(nbuf/8, maxra);
     if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
         maxra = (filesize / size) - lblkno;
@@ -803,7 +803,9 @@ cluster_wbuild(vp, size, start_lbn, len)
         (tbp->b_bcount != tbp->b_bufsize) ||
         (tbp->b_bcount != size) ||
         (len == 1) ||
-        ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
+        ((bp = (vp->v_vflag & VV_MD) ?
+        trypbuf(&cluster_pbuf_freecnt) :
+        getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
             totalwritten += tbp->b_bufsize;
             bawrite(tbp);
             ++start_lbn;

Modified: user/kib/vm6/sys/kern/vfs_default.c
==============================================================================
--- user/kib/vm6/sys/kern/vfs_default.c    Sat Feb 20 16:32:33 2010    (r204131)
+++ user/kib/vm6/sys/kern/vfs_default.c    Sat Feb 20 16:34:42 2010    (r204132)
@@ -77,6 +77,8 @@ static int dirent_exists(struct vnode *v
 
 #define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4)
 
+static int vop_stdextend(struct vop_extend_args *ap);
+
 /*
  * This vnode table stores what we want to do if the filesystem doesn't
  * implement a particular VOP.
@@ -118,6 +120,7 @@ struct vop_vector default_vnodeops = {
     .vop_unlock =        vop_stdunlock,
     .vop_vptocnp =        vop_stdvptocnp,
     .vop_vptofh =        vop_stdvptofh,
+    .vop_extend =        vop_stdextend,
 };
 
 /*
@@ -825,6 +828,23 @@ out:
     return (error);
 }
 
+static int
+vop_stdextend(struct vop_extend_args *ap)
+{
+    struct vattr vattr, oattr;
+    int error;
+
+    error = VOP_GETATTR(ap->a_vp, &oattr, ap->a_cred);
+    if (error != 0)
+        return (error);
+    if (oattr.va_size >= ap->a_size)
+        return (0);
+    VATTR_NULL(&vattr);
+    vattr.va_size = ap->a_size;
+    return (VOP_SETATTR(ap->a_vp, &vattr, ap->a_cred));
+}
+
 /*
  * vfs default ops
  * used to fill the vfs function table to get reasonable default return values.
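vop_stdextend() gives filesystems a default "extend" that never shrinks: if
the current size already covers the request it does nothing, otherwise it
issues VOP_SETATTR with the new size. The decision reduces to a max(); a
trivial standalone check of that rule (hypothetical names, not kernel code):

#include <assert.h>

/* Models the file size resulting from an extend request. */
static long
extend_model(long cur_size, long req_size)
{
    return (cur_size >= req_size ? cur_size : req_size);
}

int
main(void)
{
    assert(extend_model(100, 50) == 100);    /* never shrinks */
    assert(extend_model(100, 400) == 400);    /* grows to the request */
    return (0);
}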
Modified: user/kib/vm6/sys/kern/vfs_subr.c
==============================================================================
--- user/kib/vm6/sys/kern/vfs_subr.c    Sat Feb 20 16:32:33 2010    (r204131)
+++ user/kib/vm6/sys/kern/vfs_subr.c    Sat Feb 20 16:34:42 2010    (r204132)
@@ -861,6 +861,7 @@ vdestroy(struct vnode *vp)
     /* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
     vp->v_op = NULL;
 #endif
+    rangelock_destroy(&vp->v_rl);
     lockdestroy(vp->v_vnlock);
     mtx_destroy(&vp->v_interlock);
     mtx_destroy(BO_MTX(bo));
@@ -1015,6 +1016,7 @@ alloc:
         if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
             vp->v_vflag |= VV_NOKNOTE;
     }
+    rangelock_init(&vp->v_rl);
 
     *vpp = vp;
     return (0);

Modified: user/kib/vm6/sys/kern/vfs_vnops.c
==============================================================================
--- user/kib/vm6/sys/kern/vfs_vnops.c    Sat Feb 20 16:32:33 2010    (r204131)
+++ user/kib/vm6/sys/kern/vfs_vnops.c    Sat Feb 20 16:34:42 2010    (r204132)
@@ -37,12 +37,14 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/kdb.h>
 #include <sys/stat.h>
+#include <sys/sysctl.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/limits.h>
@@ -62,6 +64,13 @@ __FBSDID("$FreeBSD$");
 
 #include <security/mac/mac_framework.h>
 
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+static int vmio_enabled = 1;
+SYSCTL_INT(_vfs, OID_AUTO, vmio_enabled, CTLFLAG_RW, &vmio_enabled, 0,
+    "Use vm pages copyin/out instead of vops for read/write");
+
 static fo_rdwr_t    vn_read;
 static fo_rdwr_t    vn_write;
 static fo_truncate_t    vn_truncate;
@@ -83,6 +92,9 @@ struct fileops vnops = {
     .fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
+static int vn_write_chunk(struct vnode *, struct uio *, struct ucred *,
+    struct ucred *, int);
+
 int
 vn_open(ndp, flagp, cmode, fp)
     struct nameidata *ndp;
@@ -275,17 +287,14 @@ vn_writechk(vp)
  * Vnode close call
  */
 int
-vn_close(vp, flags, file_cred, td)
-    register struct vnode *vp;
-    int flags;
-    struct ucred *file_cred;
-    struct thread *td;
+vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
+    struct thread *td)
 {
-    struct mount *mp;
+    struct mount *mp, *mp1;
     int error, lock_flags;
 
-    if (!(flags & FWRITE) && vp->v_mount != NULL &&
-        vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
+    if (!(flags & FWRITE) && (mp1 = vp->v_mount) != NULL &&
+        MNT_SHARED_WRITES(mp1))
         lock_flags = LK_SHARED;
     else
         lock_flags = LK_EXCLUSIVE;
@@ -333,7 +342,7 @@ sequential_heuristic(struct uio *uio, st
      * closely related to the best I/O size for real disks than
      * to any block size used by software.
      */
-    fp->f_seqcount += howmany(uio->uio_resid, 16384);
+    fp->f_seqcount += howmany(uio->uio_resid, FRA_BLOCK_SZ);
     if (fp->f_seqcount > IO_SEQMAX)
         fp->f_seqcount = IO_SEQMAX;
     return (fp->f_seqcount << IO_SEQSHIFT);
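sequential_heuristic() now charges one sequentiality point per FRA_BLOCK_SZ
(16 kB) chunk of the request, clamps the count at IO_SEQMAX, and returns it
shifted by IO_SEQSHIFT so it rides in the upper bits of ioflag. A standalone
model of just that computation, not part of the commit; the IO_SEQMAX and
IO_SEQSHIFT values (0x7F and 16) are assumed from sys/vnode.h of that era and
are worth verifying against the tree, and the real function's check for
actually-sequential access is omitted:

#include <stdio.h>

#define FRA_BLOCK_SZ    16384    /* from sys/file.h in this commit */
#define IO_SEQMAX    0x7F    /* assumed, see sys/vnode.h */
#define IO_SEQSHIFT    16    /* assumed, see sys/vnode.h */

#define howmany(x, y)    (((x) + ((y) - 1)) / (y))

static int seqcount;    /* models fp->f_seqcount */

static int
sequential_heuristic_model(long resid)
{
    seqcount += howmany(resid, FRA_BLOCK_SZ);
    if (seqcount > IO_SEQMAX)
        seqcount = IO_SEQMAX;
    return (seqcount << IO_SEQSHIFT);
}

int
main(void)
{
    /* Three sequential 64 kB reads ramp the read-ahead hint. */
    for (int i = 0; i < 3; i++)
        printf("ioflag hint: %#x\n", sequential_heuristic_model(65536));
    return (0);
}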
@@ -351,76 +360,71 @@ sequential_heuristic(struct uio *uio, st
  * Package up an I/O request on a vnode into a uio and do it.
  */
 int
-vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
-    aresid, td)
-    enum uio_rw rw;
-    struct vnode *vp;
-    void *base;
-    int len;
-    off_t offset;
-    enum uio_seg segflg;
-    int ioflg;
-    struct ucred *active_cred;
-    struct ucred *file_cred;
-    int *aresid;
-    struct thread *td;
+vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
+    enum uio_seg segflg, int ioflg, struct ucred *active_cred,
+    struct ucred *file_cred, int *aresid, struct thread *td)
 {
     struct uio auio;
     struct iovec aiov;
     struct mount *mp;
     struct ucred *cred;
+    void *rl_cookie;
     int error, lock_flags;
 
     VFS_ASSERT_GIANT(vp->v_mount);
 
+    auio.uio_iov = &aiov;
+    auio.uio_iovcnt = 1;
+    aiov.iov_base = base;
+    aiov.iov_len = len;
+    auio.uio_resid = len;
+    auio.uio_offset = offset;
+    auio.uio_segflg = segflg;
+    auio.uio_rw = rw;
+    auio.uio_td = td;
+    error = 0;
+
     if ((ioflg & IO_NODELOCKED) == 0) {
+        if (rw == UIO_READ)
+            rl_cookie = rangelock_rlock(vp, offset, len);
+        else
+            rl_cookie = rangelock_wlock(vp, offset, len);
         mp = NULL;
         if (rw == UIO_WRITE) {
             if (vp->v_type != VCHR &&
                 (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
-                return (error);
+                goto out;
             if (MNT_SHARED_WRITES(mp) ||
-                ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
+                ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
                 lock_flags = LK_SHARED;
-            } else {
+            else
                 lock_flags = LK_EXCLUSIVE;
-            }
-            vn_lock(vp, lock_flags | LK_RETRY);
         } else
-            vn_lock(vp, LK_SHARED | LK_RETRY);
+            lock_flags = LK_SHARED;
+        vn_lock(vp, lock_flags | LK_RETRY);
+    } else
+        rl_cookie = NULL;
 
-    }
     ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
-    auio.uio_iov = &aiov;
-    auio.uio_iovcnt = 1;
-    aiov.iov_base = base;
-    aiov.iov_len = len;
-    auio.uio_resid = len;
-    auio.uio_offset = offset;
-    auio.uio_segflg = segflg;
-    auio.uio_rw = rw;
-    auio.uio_td = td;
-    error = 0;
 #ifdef MAC
     if ((ioflg & IO_NOMACCHECK) == 0) {
-        if (rw == UIO_READ)
-            error = mac_vnode_check_read(active_cred, file_cred,
-                vp);
-        else
+        if (rw == UIO_WRITE)
             error = mac_vnode_check_write(active_cred, file_cred,
                 vp);
     }
 #endif
     if (error == 0) {
-        if (file_cred)
+        if (file_cred != NULL)
             cred = file_cred;
         else
             cred = active_cred;
         if (rw == UIO_READ)
-            error = VOP_READ(vp, &auio, ioflg, cred);
+            error = vn_read_chunk(vp, &auio, active_cred, cred,
+                ioflg | IO_NODELOCKED);
         else
-            error = VOP_WRITE(vp, &auio, ioflg, cred);
+            error = vn_write_chunk(vp, &auio, active_cred, cred,
+                ioflg | IO_NODELOCKED);
     }
     if (aresid)
         *aresid = auio.uio_resid;
@@ -428,10 +432,13 @@ vn_rdwr(rw, vp, base, len, offset, segfl
     if (auio.uio_resid && error == 0)
         error = EIO;
     if ((ioflg & IO_NODELOCKED) == 0) {
-        if (rw == UIO_WRITE && vp->v_type != VCHR)
-            vn_finished_write(mp);
         VOP_UNLOCK(vp, 0);
+        if (mp != NULL)
+            vn_finished_write(mp);
     }
+out:
+    if (rl_cookie != NULL)
+        rangelock_unlock(vp, rl_cookie);
     return (error);
 }
 
@@ -493,68 +500,148 @@ vn_rdwr_inchunks(rw, vp, base, len, offs
     return (error);
 }
 
+static struct mtx *
+vn_lock_foffset(struct file *fp)
+{
+    struct mtx *mtxp;
+
+    mtxp = mtx_pool_find(mtxpool_sleep, fp);
+    mtx_lock(mtxp);
+    while (fp->f_vnread_flags & FOFFSET_LOCKED) {
+        fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+        msleep(&fp->f_vnread_flags, mtxp, PUSER - 1,
+            "vnread offlock", 0);
+    }
+    fp->f_vnread_flags |= FOFFSET_LOCKED;
+    mtx_unlock(mtxp);
+    return (mtxp);
+}
+
+static void
+vn_unlock_foffset(struct file *fp, struct mtx *mtxp)
+{
+
+    mtx_lock(mtxp);
+    if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
+        wakeup(&fp->f_vnread_flags);
+    fp->f_vnread_flags = 0;
+    mtx_unlock(mtxp);
+}
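vn_lock_foffset() and vn_unlock_foffset() factor the previously open-coded
f_offset interlock into helpers: a pooled mutex plus two flag bits form a tiny
sleep lock. The userland model below, not part of the commit, reproduces the
protocol with a pthread mutex and condition variable standing in for
msleep()/wakeup() (all names hypothetical):

#include <pthread.h>
#include <stdio.h>

#define FOFFSET_LOCKED        0x1
#define FOFFSET_LOCK_WAITING    0x2

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;    /* sleep channel */
static int vnread_flags;    /* models fp->f_vnread_flags */

static void
lock_foffset(void)
{
    pthread_mutex_lock(&mtx);
    while (vnread_flags & FOFFSET_LOCKED) {
        vnread_flags |= FOFFSET_LOCK_WAITING;
        pthread_cond_wait(&cv, &mtx);    /* msleep() analogue */
    }
    vnread_flags |= FOFFSET_LOCKED;
    pthread_mutex_unlock(&mtx);
}

static void
unlock_foffset(void)
{
    pthread_mutex_lock(&mtx);
    if (vnread_flags & FOFFSET_LOCK_WAITING)
        pthread_cond_broadcast(&cv);    /* wakeup() analogue */
    vnread_flags = 0;
    pthread_mutex_unlock(&mtx);
}

int
main(void)
{
    lock_foffset();
    /* ... read f_offset, perform the I/O, store f_offset back ... */
    unlock_foffset();
    printf("flags after unlock: %d\n", vnread_flags);
    return (0);
}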
+
+int
+vn_read_chunk(struct vnode *vp, struct uio *uio, struct ucred *active_cred,
+    struct ucred *fcred, int ioflag)
+{
+    int error, vfslocked;
+
+    error = 0;
+    vfslocked = 0;    /* gcc */
+
+    if ((ioflag & IO_NODELOCKED) == 0) {
+        vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+        vn_lock(vp, LK_SHARED | LK_RETRY);
+    }
+
+#ifdef MAC
+    if ((ioflag & IO_NOMACCHECK) == 0)
+        error = mac_vnode_check_read(active_cred, fcred, vp);
+#endif
+    if (error == 0) {
+        if (!vmio_enabled ||
+            (error = vnode_pager_read(vp, uio, ioflag)) == EOPNOTSUPP)
+            error = VOP_READ(vp, uio, ioflag, fcred);
+    }
+    if ((ioflag & IO_NODELOCKED) == 0) {
+        VOP_UNLOCK(vp, 0);
+        VFS_UNLOCK_GIANT(vfslocked);
+    }
+    return (error);
+}
+
 /*
  * File table vnode read routine.
  */
 static int
-vn_read(fp, uio, active_cred, flags, td)
-    struct file *fp;
-    struct uio *uio;
-    struct ucred *active_cred;
-    struct thread *td;
-    int flags;
+vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
+    struct thread *td)
 {
     struct vnode *vp;
-    int error, ioflag;
     struct mtx *mtxp;
-    int vfslocked;
+    void *rl_cookie;
+    int ioflag;
+    int error;
 
     KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
         uio->uio_td, td));
-    mtxp = NULL;
-    vp = fp->f_vnode;
     ioflag = 0;
     if (fp->f_flag & FNONBLOCK)
         ioflag |= IO_NDELAY;
     if (fp->f_flag & O_DIRECT)
         ioflag |= IO_DIRECT;
-    vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+    vp = fp->f_vnode;
+
     /*
      * According to McKusick the vn lock was protecting f_offset here.
      * It is now protected by the FOFFSET_LOCKED flag.
      */
     if ((flags & FOF_OFFSET) == 0) {
-        mtxp = mtx_pool_find(mtxpool_sleep, fp);
-        mtx_lock(mtxp);
-        while(fp->f_vnread_flags & FOFFSET_LOCKED) {
-            fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
-            msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
-                "vnread offlock", 0);
-        }
-        fp->f_vnread_flags |= FOFFSET_LOCKED;
-        mtx_unlock(mtxp);
-        vn_lock(vp, LK_SHARED | LK_RETRY);
+        mtxp = vn_lock_foffset(fp);
         uio->uio_offset = fp->f_offset;
     } else
-        vn_lock(vp, LK_SHARED | LK_RETRY);
-
+        mtxp = NULL;    /* gcc */
+    if (vp->v_type == VREG)
+        rl_cookie = rangelock_rlock(vp, uio->uio_offset,
+            uio->uio_resid);
+    else
+        rl_cookie = NULL;
     ioflag |= sequential_heuristic(uio, fp);
+    error = vn_read_chunk(vp, uio, active_cred, fp->f_cred, ioflag);
+    fp->f_nextoff = uio->uio_offset;
+    if (rl_cookie != NULL)
+        rangelock_unlock(vp, rl_cookie);
+    if ((flags & FOF_OFFSET) == 0) {
+        fp->f_offset = uio->uio_offset;
+        vn_unlock_foffset(fp, mtxp);
+    }
+    return (error);
+}
+
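vn_read_chunk() (and vn_write_chunk() just below) share one pattern: when the
vfs.vmio_enabled sysctl is set, try the page-based vnode_pager path first and
fall back to the classic VOP when it reports EOPNOTSUPP. A standalone model of
that control flow, not part of the commit (the stub functions are
hypothetical):

#include <errno.h>
#include <stdio.h>

static int vmio_enabled = 1;    /* models the vfs.vmio_enabled sysctl */

/* Hypothetical stand-in: the page-based path may refuse with EOPNOTSUPP. */
static int
pager_read_model(int supported)
{
    return (supported ? 0 : EOPNOTSUPP);
}

static int
vop_read_model(void)
{
    return (0);    /* the classic VOP_READ() path always applies */
}

static int
read_chunk_model(int pager_supported)
{
    int error;

    if (!vmio_enabled ||
        (error = pager_read_model(pager_supported)) == EOPNOTSUPP)
        error = vop_read_model();
    return (error);
}

int
main(void)
{
    printf("pager path:    %d\n", read_chunk_model(1));
    printf("fallback path: %d\n", read_chunk_model(0));
    return (0);
}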
+static int
+vn_write_chunk(struct vnode *vp, struct uio *uio, struct ucred *active_cred,
+    struct ucred *fcred, int ioflag)
+{
+    struct mount *mp, *mp1;
+    int error, lock_flags, vfslocked;
+
+    mp = NULL;
+    vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+    if (vp->v_type == VREG)
+        bwillwrite();
+    if (vp->v_type != VCHR &&
+        (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+        goto unlock;
+    if (MNT_SHARED_WRITES(mp) ||
+        (mp == NULL && (mp1 = vp->v_mount) != NULL &&
+        MNT_SHARED_WRITES(mp1)))
+        lock_flags = LK_SHARED;
+    else
+        lock_flags = LK_EXCLUSIVE;
+    vn_lock(vp, lock_flags | LK_RETRY);
 #ifdef MAC
-    error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
-    if (error == 0)
+    error = mac_vnode_check_write(active_cred, fcred, vp);
+#else
+    error = 0;
 #endif
-        error = VOP_READ(vp, uio, ioflag, fp->f_cred);
-    if ((flags & FOF_OFFSET) == 0) {
-        fp->f_offset = uio->uio_offset;
-        mtx_lock(mtxp);
-        if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
-            wakeup(&fp->f_vnread_flags);
-        fp->f_vnread_flags = 0;
-        mtx_unlock(mtxp);
+    if (error == 0) {
+        if (!vmio_enabled ||
+            (error = vnode_pager_write(vp, uio, ioflag)) == EOPNOTSUPP)
+            error = VOP_WRITE(vp, uio, ioflag, fcred);
     }
-    fp->f_nextoff = uio->uio_offset;
     VOP_UNLOCK(vp, 0);
+    if (vp->v_type != VCHR)
+        vn_finished_write(mp);
+unlock:
     VFS_UNLOCK_GIANT(vfslocked);
     return (error);
 }
@@ -563,24 +650,17 @@ vn_read(fp, uio, active_cred, flags, td)
 /*
  * File table vnode write routine.
  */
 static int
-vn_write(fp, uio, active_cred, flags, td)
-    struct file *fp;
-    struct uio *uio;
-    struct ucred *active_cred;
-    struct thread *td;
-    int flags;
+vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
+    struct thread *td)
 {
     struct vnode *vp;
-    struct mount *mp;
-    int error, ioflag, lock_flags;
-    int vfslocked;
+    struct mtx *mtxp;
+    void *rl_cookie;
+    int error, ioflag;
 
     KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
         uio->uio_td, td));
     vp = fp->f_vnode;
-    vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-    if (vp->v_type == VREG)
-        bwillwrite();
     ioflag = IO_UNIT;
     if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
         ioflag |= IO_APPEND;
@@ -591,36 +671,32 @@ vn_write(fp, uio, active_cred, flags, td
     if ((fp->f_flag & O_FSYNC) ||
         (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
         ioflag |= IO_SYNC;
-    mp = NULL;
-    if (vp->v_type != VCHR &&
-        (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
-        goto unlock;
-
-    if ((MNT_SHARED_WRITES(mp) ||
-        ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
-        (flags & FOF_OFFSET) != 0) {
-        lock_flags = LK_SHARED;
-    } else {
-        lock_flags = LK_EXCLUSIVE;
-    }
-
-    vn_lock(vp, lock_flags | LK_RETRY);
-    if ((flags & FOF_OFFSET) == 0)
+    if ((flags & FOF_OFFSET) == 0) {
+        mtxp = vn_lock_foffset(fp);
         uio->uio_offset = fp->f_offset;
+    } else
+        mtxp = NULL;    /* gcc */
     ioflag |= sequential_heuristic(uio, fp);
-#ifdef MAC
-    error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
-    if (error == 0)
-#endif
-        error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
-    if ((flags & FOF_OFFSET) == 0)
+    if (vp->v_type == VREG) {
+        if ((ioflag & IO_APPEND) || !(flags & FOF_OFFSET))
+            /*
+             * For appenders, punt and lock the whole
+             * range. It also protects f_offset.
+             */
+            rl_cookie = rangelock_wlock(vp, 0, (size_t)-1);
+        else
+            rl_cookie = rangelock_wlock(vp, uio->uio_offset,
+                uio->uio_resid);
+    } else
+        rl_cookie = NULL;
+    error = vn_write_chunk(vp, uio, active_cred, fp->f_cred, ioflag);
+    if (rl_cookie != NULL)
+        rangelock_unlock(vp, rl_cookie);
+    if ((flags & FOF_OFFSET) == 0) {
         fp->f_offset = uio->uio_offset;
+        vn_unlock_foffset(fp, mtxp);
+    }
     fp->f_nextoff = uio->uio_offset;
-    VOP_UNLOCK(vp, 0);
-    if (vp->v_type != VCHR)
-        vn_finished_write(mp);
-unlock:
-    VFS_UNLOCK_GIANT(vfslocked);
     return (error);
 }
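For appends (IO_APPEND) and implicit-offset writes, the final write offset is
unknown until the vnode lock is held, so vn_write() above punts and
write-locks the whole range, which also protects f_offset; explicit-offset
writes lock only [offset, offset + resid). A compact standalone model of that
decision, not part of the commit (the flag values here are stand-ins, not the
kernel's):

#include <stdint.h>
#include <stdio.h>

#define IO_APPEND    0x1    /* stand-in flag values for the model */
#define FOF_OFFSET    0x2

struct range { uint64_t base, len; };

/* Models the range vn_write() chooses to lock for a regular file. */
static struct range
write_lock_range(int ioflag, int flags, uint64_t offset, uint64_t resid)
{
    struct range r;

    if ((ioflag & IO_APPEND) || !(flags & FOF_OFFSET)) {
        /* Appenders: lock everything; also protects f_offset. */
        r.base = 0;
        r.len = UINT64_MAX;
    } else {
        r.base = offset;
        r.len = resid;
    }
    return (r);
}

int
main(void)
{
    struct range a = write_lock_range(IO_APPEND, FOF_OFFSET, 100, 50);
    struct range b = write_lock_range(0, FOF_OFFSET, 100, 50);

    printf("append: [%llu, +%llu)\n", (unsigned long long)a.base,
        (unsigned long long)a.len);
    printf("pwrite: [%llu, +%llu)\n", (unsigned long long)b.base,
        (unsigned long long)b.len);
    return (0);
}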
@@ -628,25 +704,29 @@ unlock:
 /*
  * File table truncate routine.
  */
 static int
-vn_truncate(fp, length, active_cred, td)
-    struct file *fp;
-    off_t length;
-    struct ucred *active_cred;
-    struct thread *td;
+vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+    struct thread *td)
 {
     struct vattr vattr;
     struct mount *mp;
     struct vnode *vp;
+    void *rl_cookie;
     int vfslocked;
     int error;
 
     vp = fp->f_vnode;
+
+    /*
+     * Lock the range where the shortening takes place. Increasing
+     * the file size does not need a rangelock, but it is faster to
+     * lock the range than to call VOP_GETATTR to get the current
+     * size and deal with the races.
+     */
+    rl_cookie = rangelock_wlock(vp, length, -1);
     vfslocked = VFS_LOCK_GIANT(vp->v_mount);
     error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
-    if (error) {
-        VFS_UNLOCK_GIANT(vfslocked);
-        return (error);
-    }
+    if (error)
+        goto out1;
     vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
     if (vp->v_type == VDIR) {
         error = EISDIR;
@@ -666,7 +746,9 @@ vn_truncate(fp, length, active_cred, td)
 out:
     VOP_UNLOCK(vp, 0);
     vn_finished_write(mp);
+out1:
     VFS_UNLOCK_GIANT(vfslocked);
+    rangelock_unlock(vp, rl_cookie);
     return (error);
 }

Modified: user/kib/vm6/sys/kern/vnode_if.src
==============================================================================
--- user/kib/vm6/sys/kern/vnode_if.src    Sat Feb 20 16:32:33 2010    (r204131)
+++ user/kib/vm6/sys/kern/vnode_if.src    Sat Feb 20 16:34:42 2010    (r204132)
@@ -611,3 +611,12 @@ vop_vptocnp {
     INOUT char *buf;
     INOUT int *buflen;
 };
+
+%% extend    vp    L L L
+
+vop_extend {
+    IN struct vnode *vp;
+    IN struct ucred *cred;
+    IN u_quad_t size;
+    IN int flags;
+};

Modified: user/kib/vm6/sys/sys/buf.h
==============================================================================
--- user/kib/vm6/sys/sys/buf.h    Sat Feb 20 16:32:33 2010    (r204131)
+++ user/kib/vm6/sys/sys/buf.h    Sat Feb 20 16:34:42 2010    (r204132)
@@ -257,6 +257,8 @@ extern const char *buf_wmesg;    /* Defaul
 #include <sys/proc.h>    /* XXX for curthread */
 #include <sys/mutex.h>
 
+extern int vfs_read_max;
+
 /*
  * Initialize a lock.
  */

Modified: user/kib/vm6/sys/sys/file.h
==============================================================================
--- user/kib/vm6/sys/sys/file.h    Sat Feb 20 16:32:33 2010    (r204131)
+++ user/kib/vm6/sys/sys/file.h    Sat Feb 20 16:34:42 2010    (r204132)
@@ -141,6 +141,8 @@ struct file {
 #define    FOFFSET_LOCKED        0x1
 #define    FOFFSET_LOCK_WAITING    0x2
 
+#define    FRA_BLOCK_SZ    16384
+
 #endif /* _KERNEL || _WANT_FILE */
 
 /*

Modified: user/kib/vm6/sys/sys/proc.h
==============================================================================
--- user/kib/vm6/sys/sys/proc.h    Sat Feb 20 16:32:33 2010    (r204131)
+++ user/kib/vm6/sys/sys/proc.h    Sat Feb 20 16:34:42 2010    (r204132)
@@ -354,7 +354,7 @@ do {    \
 #define    TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */
 #define    TDP_ALTSTACK    0x00000020 /* Have alternate signal stack. */
 #define    TDP_DEADLKTREAT    0x00000040 /* Lock aquisition - deadlock treatment. */
-#define    TDP_UNUSED80    0x00000080 /* available. */
+#define    TDP_VMIO    0x00000080 /* Busied pages for vnode_pager io. */
 #define    TDP_NOSLEEPING    0x00000100 /* Thread is not allowed to sleep on a sq. */
 #define    TDP_OWEUPC    0x00000200 /* Call addupc() at next AST. */
 #define    TDP_ITHREAD    0x00000400 /* Thread is an interrupt thread. */
Added: user/kib/vm6/sys/sys/rangelock.h
==============================================================================
--- /dev/null    00:00:00 1970    (empty, because file is newly added)
+++ user/kib/vm6/sys/sys/rangelock.h    Sat Feb 20 16:34:42 2010    (r204132)
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov <kib@FreeBSD.org>
+ * All rights reserved.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef    _SYS_RANGELOCK_H
+#define    _SYS_RANGELOCK_H
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/lock.h>
+#include <sys/queue.h>
+#include <sys/sx.h>
+
+#ifdef _KERNEL
+
+struct vnode;
+
+struct rl_q_entry
+{
+    TAILQ_ENTRY(rl_q_entry) rl_q_link;
+    size_t rl_q_start, rl_q_end;
+    int rl_q_flags;
+};
+
+#define    RL_LOCK_READ        0x0001
+#define    RL_LOCK_WRITE        0x0002
+#define    RL_LOCK_TYPE_MASK    0x0003
+#define    RL_LOCK_GRANTED        0x0004
+
+struct rangelock
+{
+    TAILQ_HEAD(, rl_q_entry) rl_waiters;
+    struct rl_q_entry    *rl_currdep;
+};
+
+void    rangelock_init(struct rangelock *lock);
+void    rangelock_destroy(struct rangelock *lock);
+void    rangelock_unlock(struct vnode *vp, void *cookie);
+void    *rangelock_unlock_range(struct vnode *vp, void *cookie, off_t base,
+        size_t len);
+void    *rangelock_rlock(struct vnode *vp, off_t base, size_t len);
+void    *rangelock_wlock(struct vnode *vp, off_t base, size_t len);
+#endif
+
+#endif

Modified: user/kib/vm6/sys/sys/vnode.h
==============================================================================
--- user/kib/vm6/sys/sys/vnode.h    Sat Feb 20 16:32:33 2010    (r204131)
+++ user/kib/vm6/sys/sys/vnode.h    Sat Feb 20 16:34:42 2010    (r204132)
@@ -38,6 +38,7 @@
 #include <sys/lock.h>
 #include <sys/lockmgr.h>
 #include <sys/mutex.h>
+#include <sys/rangelock.h>
 #include <sys/selinfo.h>
 #include <sys/uio.h>
 #include <sys/acl.h>
@@ -168,7 +169,8 @@ struct vnode {
      */
     struct vpollinfo *v_pollinfo;        /* G Poll events, p for *v_pi */
     struct label *v_label;            /* MAC label for vnode */
-    struct lockf *v_lockf;            /* Byte-level lock list */
+    struct lockf *v_lockf;            /* Byte-level adv lock list */
+    struct rangelock v_rl;            /* Byte-range lock */
 };

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***