From owner-freebsd-arch Wed May 16 13:18:59 2001 Delivered-To: freebsd-arch@freebsd.org Received: from midten.fast.no (midten.fast.no [213.188.8.11]) by hub.freebsd.org (Postfix) with ESMTP id AE34F37B424 for ; Wed, 16 May 2001 13:18:41 -0700 (PDT) (envelope-from Tor.Egge@fast.no) Received: from fast.no (IDENT:tegge@midten.fast.no [213.188.8.11]) by midten.fast.no (8.9.3/8.9.3) with ESMTP id WAA99982; Wed, 16 May 2001 22:18:25 +0200 (CEST) Message-Id: <200105162018.WAA99982@midten.fast.no> To: dillon@earth.backplane.com Cc: arch@FreeBSD.ORG Subject: Re: on load control / process swapping From: Tor.Egge@fast.no In-Reply-To: Your message of "Wed, 16 May 2001 11:01:24 -0700 (PDT)" References: <200105161801.f4GI1Oc73283@earth.backplane.com> X-Mailer: Mew version 1.70 on Emacs 19.34.1 Mime-Version: 1.0 Content-Type: Text/Plain; charset=us-ascii Content-Transfer-Encoding: 7bit Date: Wed, 16 May 2001 22:18:25 +0200 Sender: owner-freebsd-arch@FreeBSD.ORG Precedence: bulk X-Loop: FreeBSD.ORG > > I think someone tried to implement O_DIRECT a while back, but it > was fairly complex to try to do away with caching entirely. > > I think our best bet to 'start' an implementation of O_DIRECT is > to support the flag in open() and fcntl(), and have it simply > modify the sequential detection heuristic to throw away pages > and buffers rather than simply depressing their priority. > > Eventually we can implement the direct-I/O piece of the equation. > > I could do this first part in an hour, I think. When I get home.... I've used something like the following patch since FreeBSD 3.3-STABLE. On a Dell 2450 machine running a FreeBSD 4.3-RELEASE SMP kernel it increases idle time from 0% to 95% when running a test program with 100 threads that each reads 256K from random sector-aligned locations in a 10 GB file. Read speed is increased from 120 MB/s to 160 MB/s. This implementation is not semantically correct since it doesn't check for dirty pages in the vm object. 
Index: sys/sys/vnode.h =================================================================== RCS file: /home/ncvs/src/sys/sys/vnode.h,v retrieving revision 1.150 diff -u -r1.150 vnode.h --- sys/sys/vnode.h 2001/05/01 08:34:44 1.150 +++ sys/sys/vnode.h 2001/05/09 16:09:32 @@ -220,6 +220,7 @@ #define IO_VMIO 0x20 /* data already in VMIO space */ #define IO_INVAL 0x40 /* invalidate after I/O */ #define IO_ASYNC 0x80 /* bawrite rather then bdwrite */ +#define IO_NOBUFFER 0x100 /* bypass buffer cache */ /* * Modes. Some values same as Ixxx entries from inode.h for now. Index: sys/sys/file.h =================================================================== RCS file: /home/ncvs/src/sys/sys/file.h,v retrieving revision 1.28 diff -u -r1.28 file.h --- sys/sys/file.h 2001/02/15 16:34:10 1.28 +++ sys/sys/file.h 2001/02/15 19:14:53 @@ -56,7 +56,7 @@ */ struct file { LIST_ENTRY(file) f_list;/* list of active files */ - short f_flag; /* see fcntl.h */ + int f_flag; /* see fcntl.h */ #define DTYPE_VNODE 1 /* file */ #define DTYPE_SOCKET 2 /* communications endpoint */ #define DTYPE_PIPE 3 /* pipe */ Index: sys/sys/fcntl.h =================================================================== RCS file: /home/ncvs/src/sys/sys/fcntl.h,v retrieving revision 1.10 diff -u -r1.10 fcntl.h --- sys/sys/fcntl.h 2000/04/22 15:22:21 1.10 +++ sys/sys/fcntl.h 2000/04/25 19:33:55 @@ -98,15 +98,18 @@ /* Defined by POSIX 1003.1; BSD default, but must be distinct from O_RDONLY. */ #define O_NOCTTY 0x8000 /* don't assign controlling terminal */ +/* Bypass buffer cache */ +#define O_DIRECT 0x00010000 + #ifdef _KERNEL /* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */ #define FFLAGS(oflags) ((oflags) + 1) #define OFLAGS(fflags) ((fflags) - 1) /* bits to save after open */ -#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK) +#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT) /* bits settable by fcntl(F_SETFL, ...) 
*/ -#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM) +#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|O_DIRECT) #endif /* Index: sys/kern/vfs_vnops.c =================================================================== RCS file: /home/ncvs/src/sys/kern/vfs_vnops.c,v retrieving revision 1.116 diff -u -r1.116 vfs_vnops.c --- sys/kern/vfs_vnops.c 2001/04/29 02:44:49 1.116 +++ sys/kern/vfs_vnops.c 2001/05/09 16:09:02 @@ -352,6 +360,10 @@ ioflag = 0; if (fp->f_flag & FNONBLOCK) ioflag |= IO_NDELAY; +#ifdef DIRECTIO + if (fp->f_flag & O_DIRECT) + ioflag |= IO_NOBUFFER; +#endif VOP_LEASE(vp, p, cred, LEASE_READ); vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p); if ((flags & FOF_OFFSET) == 0) Index: sys/ufs/ufs/ufs_readwrite.c =================================================================== RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_readwrite.c,v retrieving revision 1.77 diff -u -r1.77 ufs_readwrite.c --- sys/ufs/ufs/ufs_readwrite.c 2001/05/01 08:34:45 1.77 +++ sys/ufs/ufs/ufs_readwrite.c 2001/05/09 16:09:33 @@ -42,6 +42,12 @@ #define WRITE ffs_write #define WRITE_S "ffs_write" +#ifdef DIRECTIO +extern int allowrawread; +extern int ffs_rawread __P((struct vnode *vp, + struct uio *uio)); +#endif + #include #include #include @@ -86,6 +92,14 @@ mode = ip->i_mode; uio = ap->a_uio; ioflag = ap->a_ioflag; +#ifdef DIRECTIO + if ((ioflag & IO_NOBUFFER) != 0 && allowrawread != 0 && + uio->uio_iovcnt == 1 && + (uio->uio_offset & (DEV_BSIZE - 1)) == 0 && + (uio->uio_resid & (DEV_BSIZE - 1)) == 0 && + uio->uio_resid == uio->uio_iov->iov_len) + return ffs_rawread(vp, uio); +#endif #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ) @@ -251,7 +265,7 @@ * doing sequential access. 
*/ error = cluster_read(vp, ip->i_size, lbn, - size, NOCRED, uio->uio_resid, seqcount, &bp); + size, NOCRED, blkoffset + uio->uio_resid, seqcount, &bp); else if (seqcount > 1) { /* * If we are NOT allowed to cluster, then --- /dev/null Wed May 16 21:49:24 2001 +++ sys/ufs/ufs/ufs_rawread.c Sun Nov 26 06:01:31 2000 @@ -0,0 +1,307 @@ +/*- + * Copyright (c) 2000 Tor Egge + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD:$ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static int ffs_rawread_readahead __P((struct vnode *vp, + caddr_t udata, + off_t offset, + size_t len, + struct proc *p, + struct buf *bp, + caddr_t sa)); +int ffs_rawread __P((struct vnode *vp, + struct uio *uio)); + +static void ffs_rawreadwakeup __P((struct buf *bp)); + + +static int rawbufcnt = 350; +SYSCTL_INT(_debug, OID_AUTO, rawbufcnt, CTLFLAG_RD, &rawbufcnt, 0, ""); + +unsigned long allowrawread = 1; +SYSCTL_INT(_debug, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, ""); + +static unsigned long rawreadahead = 1; +SYSCTL_INT(_debug, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, ""); + +static int +ffs_rawread_readahead(vp, udata, offset, len, p, bp, sa) + struct vnode *vp; + caddr_t udata; + off_t offset; + size_t len; + struct proc *p; + struct buf *bp; + caddr_t sa; +{ + int error; + u_int iolen; + off_t blockno; + int blockoff; + int bsize; + struct vnode *dp; + int bforwards; + + bsize = vp->v_mount->mnt_stat.f_iosize; + + iolen = ((vm_offset_t) udata) & PAGE_MASK; + bp->b_bcount = len; + if (bp->b_bcount + iolen > bp->b_kvasize) { + bp->b_bcount = bp->b_kvasize; + if (iolen != 0) + bp->b_bcount -= PAGE_SIZE; + } + bp->b_flags = B_PHYS; + bp->b_iocmd = BIO_READ; + bp->b_iodone = ffs_rawreadwakeup; + bp->b_data = udata; + bp->b_saveaddr = sa; + bp->b_offset = offset; + blockno = bp->b_offset / bsize; + blockoff = (bp->b_offset % bsize) / DEV_BSIZE; + if ((daddr_t) blockno != blockno) { + return EINVAL; /* blockno overflow */ + } + + bp->b_lblkno = bp->b_blkno = blockno; + if (!useracc(bp->b_data, bp->b_bcount, VM_PROT_WRITE)) { + return EFAULT; + } + + error = VOP_BMAP(vp, bp->b_lblkno, &dp, &bp->b_blkno, &bforwards, + NULL); + if (error != 0) { + return error; + } + + if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + 
bforwards)) + bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; + bp->b_bufsize = bp->b_bcount; + bp->b_blkno += blockoff; + bp->b_dev = dp->v_rdev; + + vmapbuf(bp); + + (void) VOP_STRATEGY(dp, bp); + return 0; +} + +int +ffs_rawread(vp, uio) + struct vnode *vp; + struct uio *uio; +{ + int error, nerror; + struct buf *bp, *nbp, *tbp; + caddr_t sa, nsa, tsa; + u_int iolen; + int spl; + caddr_t udata; + long resid; + off_t offset; + struct proc *p; + + udata = uio->uio_iov->iov_base; + resid = uio->uio_resid; + offset = uio->uio_offset; + p = uio->uio_procp ? uio->uio_procp : curproc; + + if ((offset % DEV_BSIZE) != 0 || (resid % DEV_BSIZE) != 0) + return EINVAL; + + /* + * keep the process from being swapped + */ + PHOLD(p); + + error = 0; + nerror = 0; + + bp = NULL; + nbp = NULL; + sa = NULL; + nsa = NULL; + + while (resid > 0) { + + if (bp == NULL) { /* Setup first read */ + /* XXX: Leave some bufs for swap */ + bp = getpbuf(&rawbufcnt); + sa = bp->b_data; + bp->b_vp = vp; + bp->b_error = 0; + error = ffs_rawread_readahead(vp, udata, offset, + resid, p, bp, sa); + if (error != 0) + break; + + if (resid > bp->b_bufsize) { /* Setup fist readahead */ + /* XXX: Leave bufs for swap */ + if (rawreadahead != 0) + nbp = trypbuf(&rawbufcnt); + else + nbp = NULL; + if (nbp != NULL) { + nsa = nbp->b_data; + nbp->b_vp = vp; + nbp->b_error = 0; + + nerror = ffs_rawread_readahead(vp, + udata + + bp->b_bufsize, + offset + + bp->b_bufsize, + resid - + bp->b_bufsize, + p, + nbp, + nsa); + if (nerror) { + relpbuf(nbp, &rawbufcnt); + nbp = NULL; + } + } + } + } + + spl = splbio(); + while ((bp->b_flags & B_DONE) == 0) { + tsleep((caddr_t)bp, PRIBIO, "rawrd", 0); + } + splx(spl); + + vunmapbuf(bp); + + iolen = bp->b_bcount - bp->b_resid; + if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) { + nerror = 0; /* Ignore possible beyond EOF error */ + break; /* EOF */ + } + + if ((bp->b_ioflags & BIO_ERROR) != 0) { + error = bp->b_error; + break; + } + resid -= iolen; + 
udata += iolen; + offset += iolen; + if (iolen < bp->b_bufsize) { + /* Incomplete read. Try to read remaining part */ + error = ffs_rawread_readahead(vp, + udata, + offset, + bp->b_bufsize - iolen, + p, + bp, + sa); + if (error) + break; + } else if (nbp != NULL) { /* Complete read with readahead */ + + tbp = bp; + bp = nbp; + nbp = tbp; + + tsa = sa; + sa = nsa; + nsa = tsa; + + if (resid <= bp->b_bufsize) { /* No more readaheads */ + relpbuf(nbp, &rawbufcnt); + nbp = NULL; + } else { /* Setup next readahead */ + nerror = ffs_rawread_readahead(vp, + udata + + bp->b_bufsize, + offset + + bp->b_bufsize, + resid - + bp->b_bufsize, + p, + nbp, + nsa); + if (nerror != 0) { + relpbuf(nbp, &rawbufcnt); + nbp = NULL; + } + } + } else if (nerror != 0) {/* Deferred Readahead error */ + break; + } else if (resid > 0) { /* More to read, no readahead */ + error = ffs_rawread_readahead(vp, udata, offset, + resid, p, bp, sa); + if (error != 0) + break; + } + } + + if (bp != NULL) + relpbuf(bp, &rawbufcnt); + if (nbp != NULL) { /* Run down readahead buffer */ + spl = splbio(); + while ((nbp->b_flags & B_DONE) == 0) { + tsleep((caddr_t)nbp, PRIBIO, "rawrd", 0); + } + splx(spl); + vunmapbuf(nbp); + relpbuf(nbp, &rawbufcnt); + } + + if (error == 0) + error = nerror; + PRELE(p); + uio->uio_resid = resid; + return error; +} + +static void +ffs_rawreadwakeup(bp) + struct buf *bp; +{ + wakeup((caddr_t) bp); +} + Index: sys/conf/options =================================================================== RCS file: /home/ncvs/src/sys/conf/options,v retrieving revision 1.271 diff -u -r1.271 options --- sys/conf/options 2001/05/13 20:52:36 1.271 +++ sys/conf/options 2001/05/16 17:36:04 @@ -378,6 +380,7 @@ REGRESSION opt_global.h SIMPLELOCK_DEBUG opt_global.h VFS_BIO_DEBUG opt_global.h +DIRECTIO opt_global.h # These are VM related options VM_KMEM_SIZE opt_vm.h - Tor Egge To Unsubscribe: send mail to majordomo@FreeBSD.org with "unsubscribe freebsd-arch" in the body of the message