Date: Wed, 16 May 2001 23:47:46 -0700 (PDT) From: Matt Dillon <dillon@earth.backplane.com> To: Tor.Egge@fast.no Cc: arch@FreeBSD.ORG Subject: Preliminary O_DIRECT patch (for review only, not yet tested!) Message-ID: <200105170647.f4H6lkk88458@earth.backplane.com> References: <200105162222.f4GMMpC81247@earth.backplane.com> <200105162331.BAA04708@midten.fast.no>
next in thread | previous in thread | raw e-mail | index | archive | help
This is my preliminary O_DIRECT patch so far, against -stable at the
moment (Obviously it will be committed to -current first, but I have
to test it on -stable). It seems to work for reads. It doesn't work
for writes yet (the buffers still get cached).
Basically it takes Tor's infrastructure with some minor modifications,
removes the rawread/rawwrite stuff, and then adds a B_DIRECT flag
to the buffer cache. write()s are converted to synchronous writes,
and both read()s and write()s attempt to completely free the underlying
VM pages and release the buffer.
I need to figure out how to free underlying buffers/VM for write()
operations before I can commit any of this. It could be a while.
--
I've looked at the rawread/rawwrite issue and I believe it may be
possible to use the already-existing B_VMIO flag coupled with
some VM magic to achieve the equivalent in the buffer cache itself
rather than having to write a rawread/rawwrite function for each
filesystem. Filesystems already support B_VMIO. If it is possible,
then we'll have a general raw I/O solution.
-Matt
Index: kern/vfs_bio.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.242.2.7
diff -u -r1.242.2.7 vfs_bio.c
--- kern/vfs_bio.c 2001/03/02 16:45:12 1.242.2.7
+++ kern/vfs_bio.c 2001/05/17 04:21:37
@@ -1230,7 +1230,7 @@
/* unlock */
BUF_UNLOCK(bp);
- bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+ bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
splx(s);
}
@@ -1296,7 +1296,7 @@
/* unlock */
BUF_UNLOCK(bp);
- bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+ bp->b_flags &= ~(B_ORDERED | B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
splx(s);
}
@@ -1328,12 +1328,15 @@
vm_page_flag_clear(m, PG_ZERO);
/*
* Might as well free the page if we can and it has
- * no valid data.
+ * no valid data. We also free the page if the
+ * buffer was used for direct I/O
*/
if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) {
vm_page_busy(m);
vm_page_protect(m, VM_PROT_NONE);
vm_page_free(m);
+ } else if (bp->b_flags & B_DIRECT) {
+ vm_page_try_to_free(m);
} else if (vm_page_count_severe()) {
vm_page_try_to_cache(m);
}
@@ -2187,7 +2190,7 @@
}
splx(s);
- bp->b_flags &= ~B_DONE;
+ bp->b_flags &= ~(B_DONE | B_DIRECT);
} else {
/*
* Buffer is not in-core, create new buffer. The buffer
@@ -2267,7 +2270,7 @@
allocbuf(bp, size);
splx(s);
- bp->b_flags &= ~B_DONE;
+ bp->b_flags &= ~(B_DONE | B_DIRECT);
}
return (bp);
}
Index: kern/vfs_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_vnops.c,v
retrieving revision 1.87.2.6
diff -u -r1.87.2.6 vfs_vnops.c
--- kern/vfs_vnops.c 2001/02/26 04:23:16 1.87.2.6
+++ kern/vfs_vnops.c 2001/05/17 05:17:55
@@ -334,6 +334,8 @@
ioflag = 0;
if (fp->f_flag & FNONBLOCK)
ioflag |= IO_NDELAY;
+ if (fp->f_flag & O_DIRECT)
+ ioflag |= IO_DIRECT;
VOP_LEASE(vp, p, cred, LEASE_READ);
vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p);
if ((flags & FOF_OFFSET) == 0)
@@ -374,6 +376,8 @@
ioflag |= IO_APPEND;
if (fp->f_flag & FNONBLOCK)
ioflag |= IO_NDELAY;
+ if (fp->f_flag & O_DIRECT)
+ ioflag |= IO_DIRECT;
if ((fp->f_flag & O_FSYNC) ||
(vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
ioflag |= IO_SYNC;
Index: sys/buf.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/buf.h,v
retrieving revision 1.88.2.3
diff -u -r1.88.2.3 buf.h
--- sys/buf.h 2000/12/30 01:51:10 1.88.2.3
+++ sys/buf.h 2001/05/17 04:18:35
@@ -191,12 +191,14 @@
* if b_bufsize and b_bcount are not. ( b_bufsize is
* always at least DEV_BSIZE aligned, though ).
*
+ * B_DIRECT Hint (along with B_RELBUF) that we should attempt to
+ * completely free the pages underlying the buffer.
*/
#define B_AGE 0x00000001 /* Move to age queue when I/O done. */
#define B_NEEDCOMMIT 0x00000002 /* Append-write in progress. */
#define B_ASYNC 0x00000004 /* Start I/O, do not wait. */
-#define B_UNUSED0 0x00000008 /* Old B_BAD */
+#define B_DIRECT 0x00000008 /* direct I/O flag (pls free vmio) */
#define B_DEFERRED 0x00000010 /* Skipped over for cleaning */
#define B_CACHE 0x00000020 /* Bread found us in the cache. */
#define B_CALL 0x00000040 /* Call b_iodone from biodone. */
@@ -231,7 +233,7 @@
"\33paging\32xxx\31writeinprog\30want\27relbuf\26dirty" \
"\25read\24raw\23phys\22clusterok\21malloc\20nocache" \
"\17locked\16inval\15scanned\14error\13eintr\12done\11freebuf" \
- "\10delwri\7call\6cache\4bad\3async\2needcommit\1age"
+ "\10delwri\7call\6cache\4direct\3async\2needcommit\1age"
/*
* These flags are kept in b_xflags.
Index: sys/fcntl.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/fcntl.h,v
retrieving revision 1.9.2.1
diff -u -r1.9.2.1 fcntl.h
--- sys/fcntl.h 2000/08/22 01:46:30 1.9.2.1
+++ sys/fcntl.h 2001/05/17 04:01:47
@@ -98,15 +98,18 @@
/* Defined by POSIX 1003.1; BSD default, but must be distinct from O_RDONLY. */
#define O_NOCTTY 0x8000 /* don't assign controlling terminal */
+/* Attempt to bypass buffer cache */
+#define O_DIRECT 0x00010000
+
#ifdef _KERNEL
/* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */
#define FFLAGS(oflags) ((oflags) + 1)
#define OFLAGS(fflags) ((fflags) - 1)
/* bits to save after open */
-#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK)
+#define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|O_DIRECT)
/* bits settable by fcntl(F_SETFL, ...) */
-#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM)
+#define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|O_DIRECT)
#endif
/*
Index: sys/file.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/file.h,v
retrieving revision 1.22.2.5
diff -u -r1.22.2.5 file.h
--- sys/file.h 2001/02/26 04:23:21 1.22.2.5
+++ sys/file.h 2001/05/17 04:34:53
@@ -56,15 +56,14 @@
*/
struct file {
LIST_ENTRY(file) f_list;/* list of active files */
- short f_flag; /* see fcntl.h */
+ short f_FILLER3; /* (old f_flag) */
#define DTYPE_VNODE 1 /* file */
#define DTYPE_SOCKET 2 /* communications endpoint */
#define DTYPE_PIPE 3 /* pipe */
#define DTYPE_FIFO 4 /* fifo (named pipe) */
#define DTYPE_KQUEUE 5 /* event queue */
short f_type; /* descriptor type */
- short f_FILLER1; /* (OLD) reference count */
- short f_FILLER2; /* (OLD) references from message queue */
+ u_int f_flag; /* see fcntl.h */
struct ucred *f_cred; /* credentials associated with descriptor */
struct fileops {
int (*fo_read) __P((struct file *fp, struct uio *uio,
Index: sys/vnode.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/vnode.h,v
retrieving revision 1.111.2.4
diff -u -r1.111.2.4 vnode.h
--- sys/vnode.h 2000/12/30 01:51:10 1.111.2.4
+++ sys/vnode.h 2001/05/17 04:49:14
@@ -213,6 +213,7 @@
#define IO_VMIO 0x20 /* data already in VMIO space */
#define IO_INVAL 0x40 /* invalidate after I/O */
#define IO_ASYNC 0x80 /* bawrite rather then bdwrite */
+#define IO_DIRECT 0x100 /* attempt to bypass buffer cache */
/*
* Modes. Some values same as Ixxx entries from inode.h for now.
Index: ufs/ufs/ufs_readwrite.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_readwrite.c,v
retrieving revision 1.65.2.6
diff -u -r1.65.2.6 ufs_readwrite.c
--- ufs/ufs/ufs_readwrite.c 2000/12/30 01:51:11 1.65.2.6
+++ ufs/ufs/ufs_readwrite.c 2001/05/17 06:26:16
@@ -278,6 +278,15 @@
}
/*
+ * If IO_DIRECT then set B_DIRECT for the buffer. This
+ * will cause us to attempt to release the buffer later on
+ * and will cause the buffer cache to attempt to free the
+ * underlying pages.
+ */
+ if (ioflag & IO_DIRECT)
+ bp->b_flags |= B_DIRECT;
+
+ /*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
* However, if the short read did not cause an error,
@@ -319,12 +328,12 @@
if (error)
break;
- if ((ioflag & IO_VMIO) &&
- (LIST_FIRST(&bp->b_dep) == NULL)) {
+ if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
+ (LIST_FIRST(&bp->b_dep) == NULL)) {
/*
- * If there are no dependencies, and
- * it's VMIO, then we don't need the buf,
- * mark it available for freeing. The VM has the data.
+ * If there are no dependencies, and it's VMIO,
+ * then we don't need the buf, mark it available
+ * for freeing. The VM has the data.
*/
bp->b_flags |= B_RELBUF;
brelse(bp);
@@ -346,8 +355,8 @@
* so it must have come from a 'break' statement
*/
if (bp != NULL) {
- if ((ioflag & IO_VMIO) &&
- (LIST_FIRST(&bp->b_dep) == NULL)) {
+ if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
+ (LIST_FIRST(&bp->b_dep) == NULL)) {
bp->b_flags |= B_RELBUF;
brelse(bp);
} else {
@@ -449,7 +458,7 @@
resid = uio->uio_resid;
osize = ip->i_size;
flags = 0;
- if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
+ if ((ioflag & (IO_SYNC|IO_DIRECT)) && !DOINGASYNC(vp))
flags = B_SYNC;
if (object && (object->flags & OBJ_OPT)) {
@@ -486,6 +495,8 @@
ap->a_cred, flags, &bp);
if (error != 0)
break;
+ if (ioflag & IO_DIRECT)
+ bp->b_flags |= B_DIRECT;
if (uio->uio_offset + xfersize > ip->i_size) {
ip->i_size = uio->uio_offset + xfersize;
@@ -498,11 +509,12 @@
error =
uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
- if ((ioflag & IO_VMIO) &&
- (LIST_FIRST(&bp->b_dep) == NULL))
+ if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
+ (LIST_FIRST(&bp->b_dep) == NULL)) {
bp->b_flags |= B_RELBUF;
+ }
- if (ioflag & IO_SYNC) {
+ if (ioflag & (IO_SYNC|IO_DIRECT)) {
(void)bwrite(bp);
} else if (vm_page_count_severe() ||
buf_dirty_count_severe() ||
Index: vm/vm_page.c
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_page.c,v
retrieving revision 1.147.2.6
diff -u -r1.147.2.6 vm_page.c
--- vm/vm_page.c 2001/03/03 23:06:09 1.147.2.6
+++ vm/vm_page.c 2001/05/17 04:22:38
@@ -1353,6 +1353,31 @@
}
/*
+ * vm_page_try_to_free()
+ *
+ * Attempt to free the page. If we cannot free it, we do nothing.
+ * 1 is returned on success, 0 on failure.
+ */
+
+int
+vm_page_try_to_free(m)
+ vm_page_t m;
+{
+ if (m->dirty || m->hold_count || m->busy || m->wire_count ||
+ (m->flags & (PG_BUSY|PG_UNMANAGED))) {
+ return(0);
+ }
+ vm_page_test_dirty(m);
+ if (m->dirty)
+ return(0);
+ vm_page_busy(m);
+ vm_page_protect(m, VM_PROT_NONE);
+ vm_page_free(m);
+ return(1);
+}
+
+
+/*
* vm_page_cache
*
* Put the specified page onto the page cache queue (if appropriate).
Index: vm/vm_page.h
===================================================================
RCS file: /home/ncvs/src/sys/vm/vm_page.h,v
retrieving revision 1.75.2.5
diff -u -r1.75.2.5 vm_page.h
--- vm/vm_page.h 2000/12/30 01:51:11 1.75.2.5
+++ vm/vm_page.h 2001/05/17 04:23:05
@@ -406,6 +406,7 @@
vm_page_t vm_page_grab __P((vm_object_t, vm_pindex_t, int));
void vm_page_cache __P((register vm_page_t));
int vm_page_try_to_cache __P((vm_page_t));
+int vm_page_try_to_free __P((vm_page_t));
void vm_page_dontneed __P((register vm_page_t));
static __inline void vm_page_copy __P((vm_page_t, vm_page_t));
static __inline void vm_page_free __P((vm_page_t));
To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-arch" in the body of the message
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200105170647.f4H6lkk88458>
