Date: Wed, 16 May 2001 22:18:25 +0200 From: Tor.Egge@fast.no To: dillon@earth.backplane.com Cc: arch@FreeBSD.ORG Subject: Re: on load control / process swapping Message-ID: <200105162018.WAA99982@midten.fast.no> In-Reply-To: Your message of "Wed, 16 May 2001 11:01:24 -0700 (PDT)" References: <200105161801.f4GI1Oc73283@earth.backplane.com>
next in thread | previous in thread | raw e-mail | index | archive | help
>
> I think someone tried to implement O_DIRECT a while back, but it
> was fairly complex to try to do away with caching entirely.
>
> I think our best bet to 'start' an implementation of O_DIRECT is
> to support the flag in open() and fcntl(), and have it simply
> modify the sequential detection heuristic to throw away pages
> and buffers rather than simply depressing their priority.
>
> Eventually we can implement the direct-I/O piece of the equation.
>
> I could do this first part in an hour, I think. When I get home....
I've used something like the following patch since FreeBSD 3.3-STABLE.
On a Dell 2450 machine running a FreeBSD 4.3-RELEASE SMP kernel it
increases idle time from 0% to 95% when running a test program with
100 threads that each read 256K from random sector-aligned locations
in a 10 GB file. Read speed is increased from 120 MB/s to 160 MB/s.
This implementation is not semantically correct since it doesn't check
for dirty pages in the vm object.
Index: sys/sys/vnode.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/vnode.h,v
retrieving revision 1.150
diff -u -r1.150 vnode.h
--- sys/sys/vnode.h 2001/05/01 08:34:44 1.150
+++ sys/sys/vnode.h 2001/05/09 16:09:32
@@ -220,6 +220,7 @@
#define IO_VMIO 0x20 /* data already in VMIO space */
#define IO_INVAL 0x40 /* invalidate after I/O */
#define IO_ASYNC 0x80 /* bawrite rather then bdwrite */
+#define IO_NOBUFFER 0x100 /* bypass buffer cache */
/*
* Modes. Some values same as Ixxx entries from inode.h for now.
Index: sys/sys/file.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/file.h,v
retrieving revision 1.28
diff -u -r1.28 file.h
--- sys/sys/file.h 2001/02/15 16:34:10 1.28
+++ sys/sys/file.h 2001/02/15 19:14:53
@@ -56,7 +56,7 @@
*/
struct file {
LIST_ENTRY(file) f_list;/* list of active files */
- short f_flag; /* see fcntl.h */
+ int f_flag; /* see fcntl.h */
#define DTYPE_VNODE 1 /* file */
#define DTYPE_SOCKET 2 /* communications endpoint */
#define DTYPE_PIPE 3 /* pipe */
Index: sys/sys/fcntl.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/fcntl.h,v
retrieving revision 1.10
diff -u -r1.10 fcntl.h
--- sys/sys/fcntl.h 2000/04/22 15:22:21 1.10
+++ sys/sys/fcntl.h 2000/04/25 19:33:55
@@ -98,15 +98,18 @@
/* Defined by POSIX 1003.1; BSD default, but must be distinct from O_RDONLY. */
#define O_NOCTTY 0x8000 /* don't assign controlling terminal */
+/* Bypass buffer cache */
+#define O_DIRECT 0x00010000
+
#ifdef _KERNEL
/* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */
#define FFLAGS(oflags) ((oflags) + 1)
#define OFLAGS(fflags) ((fflags) - 1)
/* bits to save after open */
-#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK)
+#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT)
/* bits settable by fcntl(F_SETFL, ...) */
-#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM)
+#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|O_DIRECT)
#endif
/*
Index: sys/kern/vfs_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_vnops.c,v
retrieving revision 1.116
diff -u -r1.116 vfs_vnops.c
--- sys/kern/vfs_vnops.c 2001/04/29 02:44:49 1.116
+++ sys/kern/vfs_vnops.c 2001/05/09 16:09:02
@@ -352,6 +360,10 @@
ioflag = 0;
if (fp->f_flag & FNONBLOCK)
ioflag |= IO_NDELAY;
+#ifdef DIRECTIO
+ if (fp->f_flag & O_DIRECT)
+ ioflag |= IO_NOBUFFER;
+#endif
VOP_LEASE(vp, p, cred, LEASE_READ);
vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
if ((flags & FOF_OFFSET) == 0)
Index: sys/ufs/ufs/ufs_readwrite.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_readwrite.c,v
retrieving revision 1.77
diff -u -r1.77 ufs_readwrite.c
--- sys/ufs/ufs/ufs_readwrite.c 2001/05/01 08:34:45 1.77
+++ sys/ufs/ufs/ufs_readwrite.c 2001/05/09 16:09:33
@@ -42,6 +42,12 @@
#define WRITE ffs_write
#define WRITE_S "ffs_write"
+#ifdef DIRECTIO
+extern int allowrawread;
+extern int ffs_rawread __P((struct vnode *vp,
+ struct uio *uio));
+#endif
+
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
@@ -86,6 +92,14 @@
mode = ip->i_mode;
uio = ap->a_uio;
ioflag = ap->a_ioflag;
+#ifdef DIRECTIO
+ if ((ioflag & IO_NOBUFFER) != 0 && allowrawread != 0 &&
+ uio->uio_iovcnt == 1 &&
+ (uio->uio_offset & (DEV_BSIZE - 1)) == 0 &&
+ (uio->uio_resid & (DEV_BSIZE - 1)) == 0 &&
+ uio->uio_resid == uio->uio_iov->iov_len)
+ return ffs_rawread(vp, uio);
+#endif
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_READ)
@@ -251,7 +265,7 @@
* doing sequential access.
*/
error = cluster_read(vp, ip->i_size, lbn,
- size, NOCRED, uio->uio_resid, seqcount, &bp);
+ size, NOCRED, blkoffset + uio->uio_resid, seqcount, &bp);
else if (seqcount > 1) {
/*
* If we are NOT allowed to cluster, then
--- /dev/null Wed May 16 21:49:24 2001
+++ sys/ufs/ufs/ufs_rawread.c Sun Nov 26 06:01:31 2000
@@ -0,0 +1,307 @@
+/*-
+ * Copyright (c) 2000 Tor Egge
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD:$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/vnode.h>
+#include <sys/filio.h>
+#include <sys/ttycom.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+
+#include <machine/limits.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+static int ffs_rawread_readahead __P((struct vnode *vp,
+ caddr_t udata,
+ off_t offset,
+ size_t len,
+ struct proc *p,
+ struct buf *bp,
+ caddr_t sa));
+int ffs_rawread __P((struct vnode *vp,
+ struct uio *uio));
+
+static void ffs_rawreadwakeup __P((struct buf *bp));
+
+
+static int rawbufcnt = 350; /* counter handed to getpbuf()/trypbuf()/relpbuf() below */
+SYSCTL_INT(_debug, OID_AUTO, rawbufcnt, CTLFLAG_RD, &rawbufcnt, 0, "");
+
+int allowrawread = 1; /* int: must match "extern int allowrawread" in ufs_readwrite.c and SYSCTL_INT */
+SYSCTL_INT(_debug, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, "");
+
+static int rawreadahead = 1; /* int to match SYSCTL_INT; nonzero lets ffs_rawread() try a second buf */
+SYSCTL_INT(_debug, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, "");
+/*
+ * Set up bp as a B_PHYS BIO_READ of at most len bytes from file offset
+ * `offset' of vp into the user address udata, and hand it to VOP_STRATEGY().
+ *
+ * The transfer is clipped twice: to bp->b_kvasize (less one page when udata
+ * is not page aligned) and to bsize * (1 + bforwards), where bforwards is
+ * the run length VOP_BMAP() returned.  The length actually issued is left in
+ * bp->b_bcount and bp->b_bufsize for the caller to inspect.
+ *
+ * Returns 0 once the request has been dispatched (the return value of
+ * VOP_STRATEGY() is discarded), EINVAL if the logical block number does not
+ * fit in daddr_t, EFAULT if useracc() rejects the user buffer for writing,
+ * or the error from VOP_BMAP().  On the error returns vmapbuf() has not
+ * been called.  Completion is reported through b_iodone, which is set to
+ * ffs_rawreadwakeup.
+ */
+static int
+ffs_rawread_readahead(vp, udata, offset, len, p, bp, sa)
+ struct vnode *vp;
+ caddr_t udata;
+ off_t offset;
+ size_t len;
+ struct proc *p; /* not referenced in this function */
+ struct buf *bp;
+ caddr_t sa; /* stored in bp->b_saveaddr */
+{
+ int error;
+ u_int iolen; /* byte offset of udata within its page */
+ off_t blockno; /* logical block that contains offset */
+ int blockoff; /* DEV_BSIZE units from the start of that block */
+ int bsize; /* mnt_stat.f_iosize of the mount */
+ struct vnode *dp; /* vnode filled in by VOP_BMAP(); the I/O goes to it */
+ int bforwards; /* run length filled in by VOP_BMAP() */
+
+ bsize = vp->v_mount->mnt_stat.f_iosize;
+
+ iolen = ((vm_offset_t) udata) & PAGE_MASK;
+ bp->b_bcount = len;
+ if (bp->b_bcount + iolen > bp->b_kvasize) { /* would not fit in the buf's KVA: clip */
+ bp->b_bcount = bp->b_kvasize;
+ if (iolen != 0)
+ bp->b_bcount -= PAGE_SIZE; /* give up one page for the misaligned start */
+ }
+ bp->b_flags = B_PHYS;
+ bp->b_iocmd = BIO_READ;
+ bp->b_iodone = ffs_rawreadwakeup;
+ bp->b_data = udata;
+ bp->b_saveaddr = sa;
+ bp->b_offset = offset;
+ blockno = bp->b_offset / bsize;
+ blockoff = (bp->b_offset % bsize) / DEV_BSIZE;
+ if ((daddr_t) blockno != blockno) {
+ return EINVAL; /* blockno overflow */
+ }
+
+ bp->b_lblkno = bp->b_blkno = blockno;
+ if (!useracc(bp->b_data, bp->b_bcount, VM_PROT_WRITE)) { /* the read stores into user memory */
+ return EFAULT;
+ }
+
+ error = VOP_BMAP(vp, bp->b_lblkno, &dp, &bp->b_blkno, &bforwards,
+ NULL);
+ if (error != 0) {
+ return error;
+ }
+
+ if (bp->b_bcount + blockoff * DEV_BSIZE > bsize * (1 + bforwards))
+ bp->b_bcount = bsize * (1 + bforwards) - blockoff * DEV_BSIZE; /* stay inside the run from VOP_BMAP() */
+ bp->b_bufsize = bp->b_bcount;
+ bp->b_blkno += blockoff; /* add the offset within the block, in DEV_BSIZE units */
+ bp->b_dev = dp->v_rdev;
+
+ vmapbuf(bp);
+
+ (void) VOP_STRATEGY(dp, bp); /* result ignored; the caller waits for B_DONE */
+ return 0;
+}
+
+/*
+ * Read uio->uio_resid bytes at uio->uio_offset from vp straight into the
+ * user buffer described by the first iovec, bypassing the buffer cache.
+ *
+ * Only uio->uio_iov[0] is consulted; the caller is expected to have checked
+ * that the request consists of a single iovec.  Both the offset and the
+ * length must be multiples of DEV_BSIZE, otherwise EINVAL is returned
+ * before any I/O is started.
+ *
+ * Up to two pbufs are in use at a time: bp carries the read being waited
+ * for and nbp, when rawreadahead is nonzero and trypbuf() succeeds, carries
+ * the following chunk so the device stays busy.  After each completed chunk
+ * the two are swapped.  A failure to start a readahead is remembered in
+ * nerror and only reported if the data it covered turns out to be needed.
+ *
+ * On return the uio reflects what was transferred: uio_resid, uio_offset
+ * and the iovec are all advanced, as uiomove() would have done, so callers
+ * that copy uio_offset back to the file offset see forward progress.
+ *
+ * Returns 0 on success or EOF, otherwise an errno value.
+ *
+ * XXX: dirty pages in the vnode's vm object are not checked, so data that
+ * has been written but not yet flushed is not seen by this path.
+ */
+int
+ffs_rawread(vp, uio)
+	struct vnode *vp;
+	struct uio *uio;
+{
+	int error, nerror;
+	struct buf *bp, *nbp, *tbp;
+	caddr_t sa, nsa, tsa;
+	u_int iolen;
+	int spl;
+	caddr_t udata;
+	long resid;
+	off_t offset;
+	struct proc *p;
+
+	udata = uio->uio_iov->iov_base;
+	resid = uio->uio_resid;
+	offset = uio->uio_offset;
+	p = uio->uio_procp ? uio->uio_procp : curproc;
+
+	if ((offset % DEV_BSIZE) != 0 || (resid % DEV_BSIZE) != 0)
+		return EINVAL;
+
+	/*
+	 * keep the process from being swapped
+	 */
+	PHOLD(p);
+
+	error = 0;
+	nerror = 0;
+
+	bp = NULL;
+	nbp = NULL;
+	sa = NULL;
+	nsa = NULL;
+
+	while (resid > 0) {
+
+		if (bp == NULL) { /* Setup first read */
+			/* XXX: Leave some bufs for swap */
+			bp = getpbuf(&rawbufcnt);
+			sa = bp->b_data;
+			bp->b_vp = vp;
+			bp->b_error = 0;
+			error = ffs_rawread_readahead(vp, udata, offset,
+			    resid, p, bp, sa);
+			if (error != 0)
+				break;
+
+			if (resid > bp->b_bufsize) { /* Setup first readahead */
+				/* XXX: Leave bufs for swap */
+				if (rawreadahead != 0)
+					nbp = trypbuf(&rawbufcnt);
+				else
+					nbp = NULL;
+				if (nbp != NULL) {
+					nsa = nbp->b_data;
+					nbp->b_vp = vp;
+					nbp->b_error = 0;
+
+					nerror = ffs_rawread_readahead(vp,
+					    udata +
+					    bp->b_bufsize,
+					    offset +
+					    bp->b_bufsize,
+					    resid -
+					    bp->b_bufsize,
+					    p,
+					    nbp,
+					    nsa);
+					if (nerror) {
+						relpbuf(nbp, &rawbufcnt);
+						nbp = NULL;
+					}
+				}
+			}
+		}
+
+		/* Wait for the current read; the iodone hook does the wakeup. */
+		spl = splbio();
+		while ((bp->b_flags & B_DONE) == 0) {
+			tsleep((caddr_t)bp, PRIBIO, "rawrd", 0);
+		}
+		splx(spl);
+
+		vunmapbuf(bp);
+
+		iolen = bp->b_bcount - bp->b_resid;
+		if (iolen == 0 && (bp->b_ioflags & BIO_ERROR) == 0) {
+			nerror = 0;	/* Ignore possible beyond EOF error */
+			break;		/* EOF */
+		}
+
+		if ((bp->b_ioflags & BIO_ERROR) != 0) {
+			error = bp->b_error;
+			break;
+		}
+		resid -= iolen;
+		udata += iolen;
+		offset += iolen;
+		if (iolen < bp->b_bufsize) {
+			/* Incomplete read.  Try to read remaining part */
+			error = ffs_rawread_readahead(vp,
+			    udata,
+			    offset,
+			    bp->b_bufsize - iolen,
+			    p,
+			    bp,
+			    sa);
+			if (error)
+				break;
+		} else if (nbp != NULL) { /* Complete read with readahead */
+
+			/* The readahead becomes the current read. */
+			tbp = bp;
+			bp = nbp;
+			nbp = tbp;
+
+			tsa = sa;
+			sa = nsa;
+			nsa = tsa;
+
+			if (resid <= bp->b_bufsize) { /* No more readaheads */
+				relpbuf(nbp, &rawbufcnt);
+				nbp = NULL;
+			} else { /* Setup next readahead */
+				nerror = ffs_rawread_readahead(vp,
+				    udata +
+				    bp->b_bufsize,
+				    offset +
+				    bp->b_bufsize,
+				    resid -
+				    bp->b_bufsize,
+				    p,
+				    nbp,
+				    nsa);
+				if (nerror != 0) {
+					relpbuf(nbp, &rawbufcnt);
+					nbp = NULL;
+				}
+			}
+		} else if (nerror != 0) {/* Deferred Readahead error */
+			break;
+		} else if (resid > 0) { /* More to read, no readahead */
+			error = ffs_rawread_readahead(vp, udata, offset,
+			    resid, p, bp, sa);
+			if (error != 0)
+				break;
+		}
+	}
+
+	if (bp != NULL)
+		relpbuf(bp, &rawbufcnt);
+	if (nbp != NULL) { /* Run down readahead buffer */
+		spl = splbio();
+		while ((nbp->b_flags & B_DONE) == 0) {
+			tsleep((caddr_t)nbp, PRIBIO, "rawrd", 0);
+		}
+		splx(spl);
+		vunmapbuf(nbp);
+		relpbuf(nbp, &rawbufcnt);
+	}
+
+	if (error == 0)
+		error = nerror;
+	PRELE(p);
+	/*
+	 * Publish the progress in every field of the uio, not just the
+	 * residual: callers take the new file offset from uio_offset.
+	 * iov_len must be adjusted before uio_resid is overwritten because
+	 * the amount consumed is derived from the old residual.
+	 */
+	uio->uio_iov->iov_base = udata;
+	uio->uio_iov->iov_len -= uio->uio_resid - resid;
+	uio->uio_offset = offset;
+	uio->uio_resid = resid;
+	return error;
+}
+
+/*
+ * I/O completion hook (assigned to b_iodone when a raw read is set up):
+ * wake any thread sleeping on the address of the finished buf.
+ */
+static void
+ffs_rawreadwakeup(struct buf *bp)
+{
+	caddr_t chan = (caddr_t)bp;
+
+	wakeup(chan);
+}
+
Index: sys/conf/options
===================================================================
RCS file: /home/ncvs/src/sys/conf/options,v
retrieving revision 1.271
diff -u -r1.271 options
--- sys/conf/options 2001/05/13 20:52:36 1.271
+++ sys/conf/options 2001/05/16 17:36:04
@@ -378,6 +380,7 @@
REGRESSION opt_global.h
SIMPLELOCK_DEBUG opt_global.h
VFS_BIO_DEBUG opt_global.h
+DIRECTIO opt_global.h
# These are VM related options
VM_KMEM_SIZE opt_vm.h
- Tor Egge
To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200105162018.WAA99982>
