From owner-freebsd-arch@FreeBSD.ORG Sat Nov 12 18:29:06 2005 Return-Path: X-Original-To: freebsd-arch@freebsd.org Delivered-To: freebsd-arch@freebsd.org Received: from mx1.FreeBSD.org (mx1.freebsd.org [216.136.204.125]) by hub.freebsd.org (Postfix) with ESMTP id B08E816A41F; Sat, 12 Nov 2005 18:29:06 +0000 (GMT) (envelope-from rodrigc@c-24-147-19-135.hsd1.ma.comcast.net) Received: from rwcrmhc12.comcast.net (rwcrmhc12.comcast.net [204.127.198.43]) by mx1.FreeBSD.org (Postfix) with ESMTP id F336D43D45; Sat, 12 Nov 2005 18:29:05 +0000 (GMT) (envelope-from rodrigc@c-24-147-19-135.hsd1.ma.comcast.net) Received: from dibbler.crodrigues.org (c-24-147-19-135.hsd1.ma.comcast.net[24.147.19.135]) by comcast.net (rwcrmhc12) with ESMTP id <200511121829040140012ojke>; Sat, 12 Nov 2005 18:29:04 +0000 Received: from c-24-147-19-135.hsd1.ma.comcast.net (localhost.127.in-addr.arpa [127.0.0.1]) by dibbler.crodrigues.org (8.13.4/8.13.1) with ESMTP id jACITAlP012418; Sat, 12 Nov 2005 13:29:10 -0500 (EST) (envelope-from rodrigc@c-24-147-19-135.hsd1.ma.comcast.net) Received: (from rodrigc@localhost) by c-24-147-19-135.hsd1.ma.comcast.net (8.13.4/8.13.1/Submit) id jACIT9oY012417; Sat, 12 Nov 2005 13:29:09 -0500 (EST) (envelope-from rodrigc) Date: Sat, 12 Nov 2005 13:29:09 -0500 From: Craig Rodrigues To: freebsd-arch@freebsd.org Message-ID: <20051112182909.GA4301@crodrigues.org> Mime-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline User-Agent: Mutt/1.4.2.1i Cc: kan@freebsd.org, jeff@freebsd.org Subject: [RFC] vfs_bio additions, motivated by XFS for FreeBSD project X-BeenThere: freebsd-arch@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: Discussion related to FreeBSD architecture List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 12 Nov 2005 18:29:06 -0000 Hi, Now that FreeBSD 6.0 is released, I would like to work on integrating code from the XFS for FreeBSD project into FreeBSD-CURRENT. Alexander Kabaev made some changes to vfs_bio.c which are needed by the XFS for FreeBSD code. In addition to some new functions, this patch adds three new fields to struct buf (b_fsprivate1, b_fsprivate2, b_fsprivate3). You don't see their use here, but in the XFS for FreeBSD code (which you can get from http://people.freebsd.org/~rodrigc/xfs/ ), they are used to cache certain information. Comments? --- //depot/vendor/freebsd/src/sys/kern/vfs_bio.c 2005/10/08 15:01:11 +++ //depot/projects/src/sys/kern/vfs_bio.c 2005/10/08 16:09:54 @@ -216,7 +216,7 @@ */ static struct mtx rbreqlock; -/* +/* * Synchronization (sleep/wakeup) variable for buffer requests. * Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done * by and/or. @@ -233,8 +233,12 @@ /* * Lock that protects against bwait()/bdone()/B_DONE races. */ +static struct mtx bdonelock; -static struct mtx bdonelock; +/* + * Lock that protects against bwait()/bdone()/B_DONE races. + */ +static struct mtx bpinlock; /* * Definitions for the buffer free lists. @@ -523,6 +527,7 @@ mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF); mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF); mtx_init(&bdonelock, "bdone lock", NULL, MTX_DEF); + mtx_init(&bpinlock, "bpin lock", NULL, MTX_DEF); /* next, make a null set of free lists */ for (i = 0; i < BUFFER_QUEUES; i++) @@ -636,7 +641,7 @@ * bremfree: * * Mark the buffer for removal from the appropriate free list in brelse. - * + * */ void bremfree(struct buf *bp) @@ -720,18 +725,51 @@ } /* + * Attempt to initiate asynchronous I/O on read-ahead blocks. We must + * clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set, + * the buffer is valid and we do not have to do anything. + */ +void +breada(struct vnode * vp, daddr_t * rablkno, int * rabsize, + int cnt, struct ucred * cred) +{ + struct buf *rabp; + int i; + + for (i = 0; i < cnt; i++, rablkno++, rabsize++) { + if (inmem(vp, *rablkno)) + continue; + rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); + + if ((rabp->b_flags & B_CACHE) == 0) { + if (curthread != PCPU_GET(idlethread)) + curthread->td_proc->p_stats->p_ru.ru_inblock++; + rabp->b_flags |= B_ASYNC; + rabp->b_flags &= ~B_INVAL; + rabp->b_ioflags &= ~BIO_ERROR; + rabp->b_iocmd = BIO_READ; + if (rabp->b_rcred == NOCRED && cred != NOCRED) + rabp->b_rcred = crhold(cred); + vfs_busy_pages(rabp, 0); + BUF_KERNPROC(rabp); + rabp->b_iooffset = dbtob(rabp->b_blkno); + bstrategy(rabp); + } else { + brelse(rabp); + } + } +} + +/* * Operates like bread, but also starts asynchronous I/O on - * read-ahead blocks. We must clear BIO_ERROR and B_INVAL prior - * to initiating I/O . If B_CACHE is set, the buffer is valid - * and we do not have to do anything. + * read-ahead blocks. */ int breadn(struct vnode * vp, daddr_t blkno, int size, daddr_t * rablkno, int *rabsize, int cnt, struct ucred * cred, struct buf **bpp) { - struct buf *bp, *rabp; - int i; + struct buf *bp; int rv = 0, readwait = 0; CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size); @@ -752,29 +790,8 @@ ++readwait; } - for (i = 0; i < cnt; i++, rablkno++, rabsize++) { - if (inmem(vp, *rablkno)) - continue; - rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0); + breada(vp, rablkno, rabsize, cnt, cred); - if ((rabp->b_flags & B_CACHE) == 0) { - if (curthread != PCPU_GET(idlethread)) - curthread->td_proc->p_stats->p_ru.ru_inblock++; - rabp->b_flags |= B_ASYNC; - rabp->b_flags &= ~B_INVAL; - rabp->b_ioflags &= ~BIO_ERROR; - rabp->b_iocmd = BIO_READ; - if (rabp->b_rcred == NOCRED && cred != NOCRED) - rabp->b_rcred = crhold(cred); - vfs_busy_pages(rabp, 0); - BUF_KERNPROC(rabp); - rabp->b_iooffset = dbtob(rabp->b_blkno); - bstrategy(rabp); - } else { - brelse(rabp); - } - } - if (readwait) { rv = bufwait(bp); } @@ -807,6 +824,10 @@ if (BUF_REFCNT(bp) == 0) panic("bufwrite: buffer is not busy???"); + + if (bp->b_pin_count > 0) + bunpin_wait(bp); + KASSERT(!(bp->b_vflags & BV_BKGRDINPROG), ("FFS background buffer should not get here %p", bp)); @@ -1117,6 +1138,11 @@ KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); + if (bp->b_flags & B_MANAGED) { + bqrelse(bp); + return; + } + if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) && !(bp->b_flags & B_INVAL)) { @@ -1286,7 +1312,7 @@ } } - + if (BUF_REFCNT(bp) > 1) { /* do not release to free list */ BUF_UNLOCK(bp); @@ -1394,6 +1420,18 @@ BUF_UNLOCK(bp); return; } + + if (bp->b_flags & B_MANAGED) { + if (bp->b_flags & B_REMFREE) { + mtx_lock(&bqlock); + bremfreel(bp); + mtx_unlock(&bqlock); + } + bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + BUF_UNLOCK(bp); + return; + } + mtx_lock(&bqlock); /* Handle delayed bremfree() processing. */ if (bp->b_flags & B_REMFREE) @@ -1821,6 +1859,10 @@ bp->b_npages = 0; bp->b_dirtyoff = bp->b_dirtyend = 0; bp->b_bufobj = NULL; + bp->b_pin_count = 0; + bp->b_fsprivate1 = NULL; + bp->b_fsprivate2 = NULL; + bp->b_fsprivate3 = NULL; LIST_INIT(&bp->b_dep); @@ -2059,6 +2101,10 @@ if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) continue; + if (bp->b_pin_count > 0) { + BUF_UNLOCK(bp); + continue; + } BO_LOCK(bp->b_bufobj); if ((bp->b_vflags & BV_BKGRDINPROG) != 0 || (bp->b_flags & B_DELWRI) == 0) { @@ -2393,6 +2439,19 @@ if ((bp->b_flags & B_VMIO) == 0 || (size > bp->b_kvasize)) { if (bp->b_flags & B_DELWRI) { + /* + * If buffer is pinned and caller does + * not want sleep waiting for it to be + * unpinned, bail out + * */ + if (bp->b_pin_count > 0) { + if (flags & GB_LOCK_NOWAIT) { + bqrelse(bp); + return (NULL); + } else { + bunpin_wait(bp); + } + } bp->b_flags |= B_NOCACHE; bwrite(bp); } else { @@ -3034,11 +3093,11 @@ struct bufobj *dropobj; void (*biodone)(struct buf *); - CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags); dropobj = NULL; - KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp))); + KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, + BUF_REFCNT(bp))); KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp)); runningbufwakeup(bp); @@ -3053,6 +3112,19 @@ bufobj_wdrop(dropobj); return; } + + bufdone_finish(bp); + + if (dropobj) + bufobj_wdrop(dropobj); +} + +void +bufdone_finish(struct buf *bp) +{ + KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, + BUF_REFCNT(bp))); + if (LIST_FIRST(&bp->b_dep) != NULL) buf_complete(bp); @@ -3118,7 +3190,8 @@ if (m == NULL) panic("biodone: page disappeared!"); bp->b_pages[i] = m; - pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), + bp->b_pages, bp->b_npages); } #if defined(VFS_BIO_DEBUG) if (OFF_TO_IDX(foff) != m->pindex) { @@ -3130,7 +3203,7 @@ /* * In the write case, the valid and clean bits are - * already changed correctly ( see bdwrite() ), so we + * already changed correctly ( see bdwrite() ), so we * only need to do this here in the read case. */ if ((bp->b_iocmd == BIO_READ) && !bogusflag && resid > 0) { @@ -3185,8 +3258,6 @@ bqrelse(bp); } else bdone(bp); - if (dropobj) - bufobj_wdrop(dropobj); } /* @@ -3742,6 +3813,32 @@ return (error); } +void +bpin(struct buf *bp) +{ + mtx_lock(&bpinlock); + bp->b_pin_count ++; + mtx_unlock(&bpinlock); +} + +void +bunpin(struct buf *bp) +{ + mtx_lock(&bpinlock); + if ( --bp->b_pin_count == 0) + wakeup(bp); + mtx_unlock(&bpinlock); +} + +void +bunpin_wait(struct buf *bp) +{ + mtx_lock(&bpinlock); + while (bp->b_pin_count > 0) + msleep(bp, &bpinlock, PRIBIO, "bwunpin", 0); + mtx_unlock(&bpinlock); +} + #include "opt_ddb.h" #ifdef DDB #include @@ -3794,3 +3891,4 @@ } } #endif /* DDB */ + --- //depot/vendor/freebsd/src/sys/kern/vfs_cluster.c 2005/08/14 09:53:08 +++ //depot/projects/src/sys/kern/vfs_cluster.c 2005/08/14 10:01:58 @@ -765,6 +765,12 @@ --len; continue; } + if (tbp->b_pin_count > 0) { + BUF_UNLOCK(tbp); + ++start_lbn; + --len; + continue; + } bremfree(tbp); tbp->b_flags &= ~B_DONE; @@ -868,6 +874,15 @@ BUF_UNLOCK(tbp); break; } + + /* + * Do not pull in pinned buffers. + */ + if (tbp->b_pin_count > 0) { + BUF_UNLOCK(tbp); + break; + } + /* * Ok, it's passed all the tests, * so remove it from the free list @@ -979,3 +994,4 @@ buflist->bs_nchildren = i + 1; return (buflist); } + --- //depot/vendor/freebsd/src/sys/sys/buf.h 2005/10/08 15:01:11 +++ //depot/projects/src/sys/sys/buf.h 2005/10/08 16:09:54 @@ -135,6 +135,10 @@ struct vm_page *b_pages[btoc(MAXPHYS)]; int b_npages; struct workhead b_dep; /* (D) List of filesystem dependencies. */ + void *b_fsprivate1; + void *b_fsprivate2; + void *b_fsprivate3; + int b_pin_count; }; #define b_object b_bufobj->bo_object @@ -214,7 +218,7 @@ #define B_01000000 0x01000000 /* Available flag. */ #define B_02000000 0x02000000 /* Available flag. */ #define B_PAGING 0x04000000 /* volatile paging I/O -- bypass VMIO */ -#define B_08000000 0x08000000 /* Available flag. */ +#define B_MANAGED 0x08000000 /* Managed by FS. */ #define B_RAM 0x10000000 /* Read ahead mark (flag) */ #define B_VMIO 0x20000000 /* VMIO flag */ #define B_CLUSTER 0x40000000 /* pagein op, so swap() can count it */ @@ -486,6 +490,7 @@ void bremfree(struct buf *); void bremfreef(struct buf *); /* XXX Force bremfree, only for nfs. */ int bread(struct vnode *, daddr_t, int, struct ucred *, struct buf **); +void breada(struct vnode *, daddr_t *, int *, int, struct ucred *); int breadn(struct vnode *, daddr_t, int, daddr_t *, int *, int, struct ucred *, struct buf **); void bdwrite(struct buf *); @@ -504,6 +509,7 @@ int bufwait(struct buf *); int bufwrite(struct buf *); void bufdone(struct buf *); +void bufdone_finish(struct buf *); int cluster_read(struct vnode *, u_quad_t, daddr_t, long, struct ucred *, long, int, struct buf **); @@ -527,7 +533,11 @@ struct buf *trypbuf(int *); void bwait(struct buf *, u_char, const char *); void bdone(struct buf *); +void bpin(struct buf *); +void bunpin(struct buf *); +void bunpin_wait(struct buf *); #endif /* _KERNEL */ #endif /* !_SYS_BUF_H_ */ + -- Craig Rodrigues rodrigc@crodrigues.org