Date: Sat, 12 Dec 2009 03:35:49 +0000 (UTC) From: Kip Macy <kmacy@FreeBSD.org> To: src-committers@freebsd.org, svn-src-user@freebsd.org Subject: svn commit: r200428 - in user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys Message-ID: <200912120335.nBC3Zn0Z029042@svn.freebsd.org>
next in thread | raw e-mail | index | archive | help
Author: kmacy Date: Sat Dec 12 03:35:49 2009 New Revision: 200428 URL: http://svn.freebsd.org/changeset/base/200428 Log: checkpoint mostly complety state of ARC / VM integration Modified: user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_bio.h user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_bio.c user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Modified: user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_bio.h ============================================================================== --- user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_bio.h Sat Dec 12 02:34:00 2009 (r200427) +++ user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_bio.h Sat Dec 12 03:35:49 2009 (r200428) @@ -34,7 +34,7 @@ $FreeBSD$ #define ZBIO_BUF_CLONING (1 << 30) /* is being cloned */ -void zbio_sync_cache(spa_t *spa, blkptr_t *bp, uint64_t txg, uint64_t size); +void zbio_sync_cache(spa_t *spa, blkptr_t *bp, uint64_t txg, void *data, uint64_t size, int bio_op); void zbio_getblk(arc_buf_t *buf); void zbio_data_getblk(arc_buf_t *buf); void zbio_relse(arc_buf_t *buf, size_t size); Modified: user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_bio.c ============================================================================== --- user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_bio.c Sat Dec 12 02:34:00 2009 (r200427) +++ user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_bio.c Sat Dec 12 03:35:49 2009 (r200428) @@ -27,6 +27,57 @@ POSSIBILITY OF SUCH DAMAGE. ***************************************************************************/ +/************************************************************************** +This module integrates the caching af pages associated with ARC buffers in a +per-SPA vm object. Each SPA also has an associated "zbio_state_t" which +tracks bufs allocated for the SPA in two splay trees. + +The first splay tree tracks bufs by the data pointer's virtual address. +It is used for malloc'ed buffers, and buffers that are VMIO but do not have +any pages in the SPA's vm object(s). + +Buffers are malloced if: + 1) the size is not a multiple of PAGE_SIZE + 2) the buffer is cloned + +There are two reasons why a VMIO buf would not have any pages in the vm object: + 1) the buffer has not yet been assigned an address on disk (and thus + has no offset in the vm object) + 2) the buffer did have pages in the vm object, but they were evicted + and replaced by a newer + +The second splay tree tracks buffers by block address and is only used +to track buffers whose pages are referenced by the vm object. It is used to +ensure that buffers that belong to an older transaction group don't have their +pages mapped by buffers belonging to a newer transaction group. + +zfs_bio assumes that buffers that are cloned and buffers whose pages +are evicted from the vm object are not used for I/O (will not be referenced +from zfs_bio_sync_cache). + +Pages in the vm object are marked valid on completion of a read or before the +initiation of a write. + + + +There are two places where we synchronize the ARC with the vm object's +page cache: getblk and sync_cache. + +In getblk for a malloced buffer we check if the page at the corresponding offset +is valid, if it is map it in and copy it in to the new buffer. For a VMIO buffer +we need to remove the pages for any existing overlapping buffers and free any +other pages in the vm object. + +In sync_cache for a malloced buffer we need to evict pages belonging to overlapping +VMIO buffers, then copy to/from any pages still in the vm object. For an unmapped +VMIO buffer, we need to remove pages belonging to any existing buffers and free +any remaining overlapping pages in the vm object. We then add the VMIO buffers +pages to a VM object. If the buffer is already mapped we mark the pages valid on a +write, on a read we set a flag in the zio and mark the pages valid before calling +the io_done I/O completion function. + + +**************************************************************************/ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); @@ -42,6 +93,7 @@ __FBSDID("$FreeBSD$"); #include <sys/kstat.h> #include <sys/sdt.h> +#include <sys/bitstring.h> #include <vm/vm_pageout.h> #ifdef _KERNEL @@ -58,180 +110,734 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, page_cach &zfs_page_cache_disable, 0, "Disable backing ARC with page cache "); static eventhandler_tag zbio_event_shutdown = NULL; - +struct zbio_state; +typedef struct zbio_state zbio_state_t; +typedef struct buf buf_t; +typedef uint64_t zbio_pindex_t; + +MALLOC_DEFINE(M_ZFS_BIO, "zfs_bio", "zfs buffer cache / vm"); + +#define B_EVICTED B_00000800 +#define B_CLONED B_00001000 +#define B_ASSIGNED B_00004000 + +#define ZB_EVICT_ALL 0x1 + +#define btos(nbytes) ((nbytes)>>DEV_BSHIFT) +#define stob(nsectors) ((nsectors)<<DEV_BSHIFT) + +#define b_arc_buf b_fsprivate2 +#define b_state b_fsprivate3 + +struct zbio_state { + struct mtx mtx; + buf_t *blkno_root; /* track buf by blkno */ + buf_t *va_root; /* track buf by data address */ + spa_t *spa; + int generation; + int resident_count; + TAILQ_HEAD(, buf) blkno_memq; /* list of resident buffers */ + TAILQ_HEAD(, buf) va_memq; /* list of resident buffers */ +}; + +#define ZBIO_STATE_LOCK(zs) mtx_lock(&(zs)->mtx) +#define ZBIO_STATE_UNLOCK(zs) mtx_unlock(&(zs)->mtx) + +#define spa_get_bio_state(spa) ((zbio_state_t *)spa_get_vnode((spa))->v_data) +#define spa_get_vm_object(spa) spa_get_vnode((spa))->v_object +#define zbio_buf_get_spa(bp) (((zbio_buf_hdr_t *)((arc_buf_t *)(bp->b_arc_buf))->b_hdr)->b_spa) + +static void zbio_buf_blkno_remove(buf_t *bp); +static void zbio_buf_va_insert(buf_t *bp, zbio_state_t *object); + +/* + * zbio_buf_blkno_splay: [ internal use only ] + * + * Implements Sleator and Tarjan's top-down splay algorithm. Returns + * the buf containing the given lblkno. If, however, that + * lblkno is not found in the tree, returns a buf that is + * adjacent to the pindex, coming before or after it. + */ +static buf_t * +zbio_buf_blkno_splay(daddr_t blkno, buf_t *root) +{ + buf_t dummy; + buf_t *lefttreemax, *righttreemin, *y; + + if (root == NULL) + return (root); + lefttreemax = righttreemin = &dummy; + for (;; root = y) { + if (blkno < root->b_blkno) { + if ((y = root->b_left) == NULL) + break; + if (blkno < y->b_blkno) { + /* Rotate right. */ + root->b_left = y->b_right; + y->b_right = root; + root = y; + if ((y = root->b_left) == NULL) + break; + } + /* Link into the new root's right tree. */ + righttreemin->b_left = root; + righttreemin = root; + } else if (blkno > root->b_blkno) { + if ((y = root->b_right) == NULL) + break; + if (blkno > y->b_blkno) { + /* Rotate left. */ + root->b_right = y->b_left; + y->b_left = root; + root = y; + if ((y = root->b_right) == NULL) + break; + } + /* Link into the new root's left tree. */ + lefttreemax->b_right = root; + lefttreemax = root; + } else + break; + } + /* Assemble the new root. */ + lefttreemax->b_right = root->b_left; + righttreemin->b_left = root->b_right; + root->b_left = dummy.b_right; + root->b_right = dummy.b_left; + return (root); +} + +static buf_t * +zbio_buf_va_splay(caddr_t va, buf_t *root) +{ + buf_t dummy; + buf_t *lefttreemax, *righttreemin, *y; + + if (root == NULL) + return (root); + lefttreemax = righttreemin = &dummy; + for (;; root = y) { + if (va < root->b_data) { + if ((y = root->b_left) == NULL) + break; + if (va < y->b_data) { + /* Rotate right. */ + root->b_left = y->b_right; + y->b_right = root; + root = y; + if ((y = root->b_left) == NULL) + break; + } + /* Link into the new root's right tree. */ + righttreemin->b_left = root; + righttreemin = root; + } else if (va > root->b_data) { + if ((y = root->b_right) == NULL) + break; + if (va > y->b_data) { + /* Rotate left. */ + root->b_right = y->b_left; + y->b_left = root; + root = y; + if ((y = root->b_right) == NULL) + break; + } + /* Link into the new root's left tree. */ + lefttreemax->b_right = root; + lefttreemax = root; + } else + break; + } + /* Assemble the new root. */ + lefttreemax->b_right = root->b_left; + righttreemin->b_left = root->b_right; + root->b_left = dummy.b_right; + root->b_right = dummy.b_left; + return (root); +} + +/* + * zbio_buf_blkno_insert: [ internal use only ] + * + * Inserts the given buf into the state splay tree and state list. + * + * The object and page must be locked. + * This routine may not block. + */ static void -_zbio_getblk(arc_buf_t *buf, int flags) +zbio_buf_blkno_insert(buf_t *bp, zbio_state_t *object) { - zbio_buf_hdr_t *hdr = (zbio_buf_hdr_t *)buf->b_hdr; - uint64_t size = hdr->b_size; - spa_t *spa = hdr->b_spa; - uint64_t blkno = hdr->b_dva.dva_word[1] & ~(1ULL<<63); - void *data; - struct vnode *vp; - struct buf *newbp; - struct bufobj *bo; - - vp = spa_get_vnode(spa); - bo = &vp->v_bufobj; - newbp = NULL; - if ((size < PAGE_SIZE) || (hdr->b_flags & ZBIO_BUF_CLONING) || - zfs_page_cache_disable) { - data = zio_buf_alloc(size); - hdr->b_flags &= ~ZBIO_BUF_CLONING; - } else if (BUF_EMPTY(hdr)) { - newbp = geteblk(size, flags); - data = newbp->b_data; + buf_t *root; + daddr_t root_blkno_end, blkno, blkno_end; + + blkno = bp->b_blkno; + blkno_end = bp->b_blkno + btos(bp->b_bcount); + + root = object->blkno_root; + if (root == NULL) { + bp->b_left = NULL; + bp->b_right = NULL; + TAILQ_INSERT_TAIL(&object->blkno_memq, bp, b_bobufs); } else { - newbp = getblk(vp, blkno, size, 0, 0, flags | GB_LOCK_NOWAIT); - if (newbp == NULL) - newbp = geteblk(size, flags); - else - brelvp(newbp); - data = newbp->b_data; - } + root = zbio_buf_blkno_splay(bp->b_blkno, root); + root_blkno_end = root->b_blkno + btos(root->b_bcount); - if (newbp != NULL) { - BUF_KERNPROC(newbp); - newbp->b_bufobj = bo; - CTR4(KTR_SPARE2, "arc_getblk() bp=%p flags %X " - "blkno %ld npages %d", - newbp, newbp->b_flags, blkno, newbp->b_npages); + if (blkno < root->b_blkno) { + KASSERT(blkno_end <= root->b_blkno, ("buffer overlap!")); + bp->b_left = root->b_left; + bp->b_right = root; + root->b_left = NULL; + TAILQ_INSERT_BEFORE(root, bp, b_bobufs); + } else if (blkno == root->b_blkno) { + panic("zbio_buf_blkno_insert: blkno already allocated"); + } else { + KASSERT(root_blkno_end <= blkno, ("buffer overlap!")); + + bp->b_right = root->b_right; + bp->b_left = root; + root->b_right = NULL; + TAILQ_INSERT_AFTER(&object->blkno_memq, root, bp, b_bobufs); + } } + object->blkno_root = bp; + object->generation++; - buf->b_bp = newbp; - buf->b_data = data; + /* + * show that the object has one more resident buffer. + */ + object->resident_count++; } -void -zbio_getblk(arc_buf_t *buf) +/* + * zbio_buf_insert: [ internal use only ] + * + * Inserts the given buf into the state splay tree and state list. + * + * The object and page must be locked. + * This routine may not block. + */ +static void +zbio_buf_va_insert(buf_t *bp, zbio_state_t *object) { + buf_t *root; + caddr_t va = bp->b_data; - _zbio_getblk(buf, 0); + bp->b_state = object; + root = object->va_root; + if (root == NULL) { + bp->b_left = NULL; + bp->b_right = NULL; + TAILQ_INSERT_TAIL(&object->va_memq, bp, b_bobufs); + } else { + root = zbio_buf_va_splay(bp->b_data, root); + if (va < root->b_data) { + bp->b_left = root->b_left; + bp->b_right = root; + root->b_left = NULL; + TAILQ_INSERT_BEFORE(root, bp, b_bobufs); + } else if (va == root->b_data) { + panic("zbio_buf_va_insert: address already allocated"); + } else { + bp->b_right = root->b_right; + bp->b_left = root; + root->b_right = NULL; + TAILQ_INSERT_AFTER(&object->va_memq, root, bp, b_bobufs); + } + } + object->va_root = bp; + object->generation++; + + /* + * show that the object has one more resident buffer. + */ + object->resident_count++; } -void -zbio_data_getblk(arc_buf_t *buf) +/* + * zbio_buf_remove: + * + * Removes the given buf from the spa's state tree + * buf list + * + * The state and buf must be locked. + * This routine may not block. + */ +static void +zbio_buf_blkno_remove(buf_t *bp) { + zbio_state_t *state; + buf_t *root; + daddr_t blkno, blkno_end; - _zbio_getblk(buf, GB_NODUMP); + if ((state = bp->b_state) == NULL) + return; + + /* + * Now remove from the object's list of backed pages. + */ + if (bp != state->blkno_root) + zbio_buf_blkno_splay(bp->b_blkno, state->blkno_root); + if (bp->b_left == NULL) + root = bp->b_right; + else { + root = zbio_buf_blkno_splay(bp->b_blkno, bp->b_left); + root->b_right = bp->b_right; + } + state->blkno_root = root; + TAILQ_REMOVE(&state->blkno_memq, bp, b_bobufs); + + /* + * And show that the object has one fewer resident page. + */ + state->resident_count--; + state->generation++; } -void -zbio_relse(arc_buf_t *buf, size_t size) +/* + * zbio_buf_va_remove: + * + * Removes the given buf from the spa's state tree + * buf list + * + * The state and buf must be locked. + * This routine may not block. + */ +static void +zbio_buf_va_remove(buf_t *bp) { - struct buf *bp = buf->b_bp; - void * data = buf->b_data; + zbio_state_t *state; + buf_t *root; + vm_offset_t va; - if (bp == NULL) { - zio_buf_free(data, size); + if ((state = bp->b_state) == NULL) return; + + /* + * Now remove from the object's list of backed pages. + */ + if (bp != state->va_root) + zbio_buf_va_splay(bp->b_data, state->va_root); + if (bp->b_left == NULL) + root = bp->b_right; + else { + root = zbio_buf_va_splay(bp->b_data, bp->b_left); + root->b_right = bp->b_right; } + state->va_root = root; + TAILQ_REMOVE(&state->va_memq, bp, b_bobufs); - CTR4(KTR_SPARE2, "arc_brelse() bp=%p flags %X" - " size %ld blkno=%ld", - bp, bp->b_flags, size, bp->b_blkno); + /* + * And show that the object has one fewer resident page. + */ + state->resident_count--; + state->generation++; +} - bp->b_flags |= B_ZFS; - brelse(bp); +/* + * zbio_buf_va_lookup: + * + * Returns the range associated with the object/offset + * pair specified; if none is found, NULL is returned. + * + * The object must be locked. + * This routine may not block. + * This is a critical path routine + */ +static buf_t * +zbio_buf_va_lookup(zbio_state_t *state, caddr_t va) +{ + buf_t *bp; + + if ((bp = state->va_root) != NULL && bp->b_data != va) { + bp = zbio_buf_va_splay(va, bp); + if ((state->va_root = bp)->b_data != va) + bp = NULL; + } + return (bp); } -void -zbio_sync_cache(spa_t *spa, blkptr_t *bp, uint64_t txg, uint64_t size) + +/* + * zbio_buf_blkno_lookup: + * + * Returns the range associated with the object/offset + * pair specified; if none is found, NULL is returned. + * + * The object must be locked. + * This routine may not block. + * This is a critical path routine + */ +static buf_t * +zbio_buf_blkno_lookup(zbio_state_t *state, daddr_t blkno) +{ + buf_t *bp; + + if ((bp = state->blkno_root) != NULL && bp->b_blkno != blkno) { + bp = zbio_buf_blkno_splay(blkno, bp); + if ((state->blkno_root = bp)->b_blkno != blkno) + bp = NULL; + } + return (bp); +} + +static void +zbio_buf_vm_object_copyin(buf_t *bp) { -#ifdef notyet - uint64_t blkno, blkno_lookup; - struct vnode *vp; - struct bufobj *bo; - struct buf *bp; - vm_pindex_t start, end; - vm_object_t object; - vm_page_t m; - int i; - if (zfs_page_cache_disable) - return; - blkno_lookup = blkno = dva->dva_word[1] & ~(1ULL<<63); - vp = spa_get_vnode(spa); - bo = &vp->v_bufobj; + +} - if (dva == NULL || spa == NULL || blkno == 0 || size == 0) - return; +static void +zbio_buf_vm_object_copyout(buf_t *bp) +{ - start = OFF_TO_IDX((blkno_lookup << 9)); - end = start + OFF_TO_IDX(size + PAGE_MASK); - object = vp->v_object; + +} - VM_OBJECT_LOCK(object); - vm_page_cache_free(object, start, end); - vm_object_page_remove(object, start, end, FALSE); -#ifdef INVARIANTS - for (i = 0; i < OFF_TO_IDX(size); i++) { - KASSERT(vm_page_lookup(object, start + i) == NULL, - ("found page at %ld blkno %ld blkno_lookup %ld", - start + i, blkno, blkno_lookup)); - } -#endif - VM_OBJECT_UNLOCK(object); -#endif +static void +zbio_buf_vm_object_evict(buf_t *bp) +{ + int i; + + /* + * remove pages from backing vm_object + */ + for (i = 0; i < bp->b_npages; i++) + vm_page_remove(bp->b_pages[i]); } -#if 0 static void -arc_pcache(struct vnode *vp, struct buf *bp, uint64_t blkno) +zbio_buf_vm_object_insert(buf_t *bp, int valid) { - vm_pindex_t start = OFF_TO_IDX((blkno << 9)); - vm_object_t object = vp->v_object; - struct bufobj *bo = &vp->v_bufobj; vm_page_t m; + vm_pindex_t start = OFF_TO_IDX(stob(bp->b_blkno)); + spa_t *spa = zbio_buf_get_spa(bp); + struct vnode *vp = spa_get_vnode(spa); + struct vm_object *object = vp->v_object; int i; - CTR3(KTR_SPARE2, "arc_pcache() bp=%p blkno %ld npages %d", - bp, blkno, bp->b_npages); VM_OBJECT_LOCK(object); - vm_page_lock_queues(); + /* + * Insert buffer pages in the object + */ for (i = 0; i < bp->b_npages; i++) { m = bp->b_pages[i]; - m->valid = VM_PAGE_BITS_ALL; + if (valid) + m->valid = VM_PAGE_BITS_ALL; vm_page_insert(m, object, start + i); m->flags &= ~PG_UNMANAGED; - vm_page_enqueue(PQ_INACTIVE, m); vdrop(vp); } + vm_page_lock_queues(); + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + vm_page_enqueue(PQ_INACTIVE, m); + } vm_page_unlock_queues(); VM_OBJECT_UNLOCK(object); - bp->b_bufobj = bo; - bp->b_flags |= B_VMIO; + } +/* + * zbio_buf_evict_overlap: [ internal use only ] + * + * Evict the pages of any buffers overlapping with this range + * + * If ZB_EVICT_ALL is passed then evict all the pages in that range + * from the vm object + * + * The object and page must be locked. + * This routine may not block. + */ static void -arc_bcache(arc_buf_t *buf) -{ - uint64_t blkno = buf->b_hdr->b_dva.dva_word[1] & ~(1ULL<<63); - struct buf *bp; - struct vnode *vp = spa_get_vnode(buf->b_hdr->b_spa); - arc_buf_hdr_t *hdr = buf->b_hdr; - int cachebuf; +zbio_buf_blkno_evict_overlap(daddr_t blkno, int size, zbio_state_t *state, + uint64_t txg, int evict_op, int locked) +{ + buf_t *root, *tmpbp; + daddr_t blkno_end, tmpblkno, tmpblkno_end; + struct cluster_list_head clh; + int i, collisions; + uint64_t tmptxg; + vm_pindex_t start, end; + vm_object_t object = spa_get_vm_object(state->spa); - if (zfs_page_cache_disable) + if (!locked) + VM_OBJECT_LOCK(object); + if ((root = state->blkno_root) == NULL) + goto done; + + collisions = 0; + root = zbio_buf_blkno_splay(blkno, root); + TAILQ_INIT(&clh); + if (blkno < root->b_blkno) + tmpbp = TAILQ_PREV(root, cluster_list_head, b_bobufs); + + /* + * Find all existing buffers that overlap with this range + */ + tmpbp = tmpbp != NULL ? tmpbp : root; + while (tmpbp != NULL && tmpbp->b_blkno < blkno_end) { + tmpblkno = tmpbp->b_blkno; + tmpblkno_end = tmpblkno + btos(tmpbp->b_bcount); + tmptxg = ((zbio_buf_hdr_t *)((arc_buf_t *)tmpbp->b_arc_buf)->b_hdr)->b_birth; + + if (((tmpblkno >= blkno) && (tmpblkno < blkno_end)) || + (tmpblkno_end > blkno) && (tmpblkno_end <= blkno_end) && + ((txg == 0) || (tmptxg < txg))) { + TAILQ_INSERT_TAIL(&clh, tmpbp, b_freelist); + collisions++; + } + tmpbp = TAILQ_NEXT(tmpbp, b_bobufs); + } + while (!TAILQ_EMPTY(&clh)) { + tmpbp = TAILQ_FIRST(&clh); + TAILQ_REMOVE(&clh, tmpbp, b_freelist); + zbio_buf_vm_object_evict(tmpbp); + + KASSERT(tmpbp->b_flags & B_EVICTED == 0, + ("buffer has already been evicted")); + tmpbp->b_flags |= B_EVICTED; + state->blkno_root = tmpbp; + /* + * move buffer to the unmanaged tree + */ + zbio_buf_blkno_remove(tmpbp); + zbio_buf_va_insert(tmpbp, state); + } +done: + if (!(collisions == 1 && tmpbp->b_blkno == blkno && tmpbp->b_bcount == size) + && (evict_op == ZB_EVICT_ALL)) { + start = OFF_TO_IDX(stob(blkno)); + end = start + OFF_TO_IDX(size); + vm_page_cache_free(object, start, end); + vm_object_page_remove(object, start, end, FALSE); +#ifdef INVARIANTS + for (i = 0; i < OFF_TO_IDX(size); i++) { + KASSERT(vm_page_lookup(object, start + i) == NULL, + ("found page at %ld blkno %ld ",start + i, blkno)); + } +#endif + } + if (!locked) + VM_OBJECT_UNLOCK(object); +} + +/* +Cases: + +A) B_MALLOC / address is known + 1) getblk: + a) page cached: copyin + mark B_CACHE + b) buffer+page cached: copyin + mark B_CACHE + c) default: N/A + 2) sync_cache: + a) page cached: copy{in, out} + b) buffer+page cached: evict overlapping pages + c) default: N/A +B) B_MALLOC / address is !known + 1) getblk: N/A + 2) sync_cache: + a) page cached: copy{in, out} + b) buffer+page cached: evict overlapping pages + c) default: N/A + +C) !B_MALLOC / address is !known + 2) sync_cache: + a) page cached: evict/free old pages + replace + b) buffer+page cached: evict overlapping pages from object + replace + c) default: add pages to vm object + +D) !B_MALLOC / address is known + 1) getblk: + a) buffer+page cached: evict pages belonging to older buffer + b) default: N/A + 2) sync_cache: N/A - we should only be doing I/O on valid B_VMIO buffers + +*/ + +static buf_t * +_zbio_getblk_malloc(zbio_buf_hdr_t *hdr, int flags) +{ + buf_t *newbp, *tmpbp; + void *data; + daddr_t blkno; + uint64_t size = hdr->b_size; + uint64_t txg = hdr->b_birth; + zbio_state_t *state = spa_get_bio_state(hdr->b_spa); + + if (flags & GB_NODUMP) + data = zio_data_buf_alloc(size); + else + data = zio_buf_alloc(size); + newbp = malloc(sizeof(struct buf), M_ZFS_BIO, M_WAITOK|M_ZERO); + newbp->b_data = data; + newbp->b_flags = (B_MALLOC|B_INVAL); + newbp->b_bcount = size; + if (!BUF_EMPTY(hdr) && !(hdr->b_flags & ZBIO_BUF_CLONING)) { + blkno = hdr->b_dva.dva_word[1] & ~(1ULL<<63); + zbio_buf_blkno_evict_overlap(blkno, size, state, txg, 0, FALSE); + newbp->b_blkno = blkno; + /* + * Copy in from the page cache if found & valid + * and mark B_CACHE + */ + zbio_buf_vm_object_copyin(newbp); + } + + if (hdr->b_flags & ZBIO_BUF_CLONING) { + newbp->b_flags |= B_CLONED; + hdr->b_flags &= ~ZBIO_BUF_CLONING; + } + zbio_buf_va_insert(newbp, state); +} + +static buf_t * +_zbio_getblk_vmio(zbio_buf_hdr_t *hdr, int flags) +{ + buf_t *newbp; + daddr_t blkno; + uint64_t size = hdr->b_size; + spa_t *spa = hdr->b_spa; + zbio_state_t *state = spa_get_bio_state(spa); + struct vnode *vp = spa_get_vnode(spa); + struct bufobj *bo = &vp->v_bufobj; + + if (BUF_EMPTY(hdr)) { + newbp = geteblk(size, flags); + zbio_buf_va_insert(newbp, state); + } else { + blkno = hdr->b_dva.dva_word[1] & ~(1ULL<<63); + zbio_buf_blkno_evict_overlap(blkno, size, state, 0, 0, FALSE); + + while (newbp == NULL) + newbp = getblk(vp, blkno, size, 0, 0, flags | GB_LOCK_NOWAIT); + brelvp(newbp); + newbp->b_flags |= B_ASSIGNED; + zbio_buf_blkno_insert(newbp, state); + } + newbp->b_bufobj = bo; + BUF_KERNPROC(newbp); + CTR4(KTR_SPARE2, "arc_getblk() bp=%p flags %X " + "blkno %ld npages %d", + newbp, newbp->b_flags, blkno, newbp->b_npages); + + return (newbp); +} + +static void +_zbio_getblk(arc_buf_t *buf, int flags) +{ + zbio_buf_hdr_t *hdr = (zbio_buf_hdr_t *)buf->b_hdr; + uint64_t size = hdr->b_size; + buf_t *newbp; + + if (zfs_page_cache_disable) { + buf->b_data = zio_buf_alloc(size); + hdr->b_flags &= ~ZBIO_BUF_CLONING; + return; + } + + if ((size & PAGE_MASK) || (hdr->b_flags & ZBIO_BUF_CLONING)) + newbp = _zbio_getblk_malloc(hdr, flags); + else + newbp = _zbio_getblk_vmio(hdr, flags); + + buf->b_bp = newbp; + buf->b_data = newbp->b_data; + newbp->b_arc_buf = buf; +} + +void +zbio_getblk(arc_buf_t *buf) +{ + + _zbio_getblk(buf, 0); +} + +void +zbio_data_getblk(arc_buf_t *buf) +{ + + _zbio_getblk(buf, GB_NODUMP); +} + +void +zbio_relse(arc_buf_t *buf, size_t size) +{ + struct buf *bp = buf->b_bp; + + if (zfs_page_cache_disable) { + zio_buf_free(buf->b_data, size); return; + } + + if (bp->b_flags & B_ASSIGNED) + zbio_buf_blkno_remove(bp); + else + zbio_buf_va_remove(bp); + + if (bp->b_flags & B_MALLOC) { + zio_buf_free(bp->b_data, size); + free(bp, M_ZFS_BIO); + } else { + CTR4(KTR_SPARE2, "arc_brelse() bp=%p flags %X" + " size %ld blkno=%ld", + bp, bp->b_flags, size, bp->b_blkno); + + bp->b_flags |= B_ZFS; + brelse(bp); + } +} - if (blkno == 0 || hdr->b_birth == 0) +void +zbio_sync_cache(spa_t *spa, blkptr_t *blkp, uint64_t txg, void *data, uint64_t size, int bio_op) +{ + buf_t *bp; + zbio_state_t *state = spa_get_bio_state(spa); + dva_t dva = *BP_IDENTITY(blkp); + daddr_t blkno = dva.dva_word[1] & ~(1ULL<<63); + struct vnode *vp = spa_get_vnode(spa); + vm_object_t object = vp->v_object; + vm_pindex_t start; + vm_page_t m; + int i; + + if (zfs_page_cache_disable) return; + /* + * XXX incomplete + */ - bp = buf->b_bp; - bp->b_birth = hdr->b_birth; - bp->b_blkno = bp->b_lblkno = blkno; - bp->b_offset = (blkno << 9); - cachebuf = ((hdr->b_datacnt == 1) && - !(hdr->b_flags & ARC_IO_ERROR) && - ((bp->b_flags & (B_INVAL|B_CACHE)) == B_CACHE) && - (blkno & 0x7) == 0); - - arc_binval(hdr->b_spa, &hdr->b_dva, hdr->b_size); - if (cachebuf) - arc_pcache(vp, bp, blkno); + + if ((bp = zbio_buf_va_lookup(state, data)) != NULL) { + KASSERT(bp->b_flags & (B_CLONED|B_EVICTED) == 0, + ("doing I/O with cloned or evicted buffer 0x%x", bp->b_flags)); + + if (bp->b_flags & B_MALLOC) { + zbio_buf_blkno_evict_overlap(blkno, size, state, txg, 0, FALSE); + + if (bio_op == BIO_READ) { + /* + * if page resident - copy in + * update zio pipeline + */ + zbio_buf_vm_object_copyin(bp); + if (bp->b_flags & B_CACHE) { + /* update zio pipeline */ + } + } else + zbio_buf_vm_object_copyout(bp); + } else { + zbio_buf_blkno_evict_overlap(blkno, size, state, 0, ZB_EVICT_ALL, TRUE); + bp->b_blkno = bp->b_lblkno = blkno; + bp->b_flags |= (B_VMIO|B_ASSIGNED); + zbio_buf_vm_object_insert(bp, bio_op == BIO_WRITE); + } + } else { + bp = zbio_buf_blkno_lookup(state, blkno); + KASSERT(bp != NULL, ("blkno=%ld data=%p unmanaged", blkno, bp->b_data)); + } } -#endif static void zbio_shutdown(void *arg __unused, int howto __unused) @@ -275,6 +881,9 @@ void zbio_init(void) { + if (zfs_page_cache_disable) + return; + zbio_event_shutdown = EVENTHANDLER_REGISTER(shutdown_pre_sync, zbio_shutdown, NULL, EVENTHANDLER_PRI_FIRST); } @@ -285,7 +894,9 @@ zbio_fini(void) if (zbio_event_shutdown != NULL) EVENTHANDLER_DEREGISTER(shutdown_pre_sync, zbio_event_shutdown); } -#else + + +#else /* !_KERNEL */ void zbio_getblk(arc_buf_t *buf) @@ -319,5 +930,5 @@ zbio_sync_cache(spa_t *spa, blkptr_t *bp { ; } - #endif + Modified: user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c ============================================================================== --- user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Sat Dec 12 02:34:00 2009 (r200427) +++ user/kmacy/releng_8_fcs_buf_xen/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Sat Dec 12 03:35:49 2009 (r200428) @@ -436,8 +436,9 @@ zio_create(zio_t *pio, spa_t *spa, uint6 if (bp != NULL) { - if ((vd == NULL) || (vd->vdev_parent == NULL)) - zbio_sync_cache(spa, bp, txg, size); + if (((vd == NULL) || (vd->vdev_parent == NULL)) && + ((type == ZIO_TYPE_WRITE) || (type == ZIO_TYPE_READ))) + zbio_sync_cache(spa, bp, txg, data, size, type == ZIO_TYPE_WRITE ? BIO_WRITE : BIO_READ); zio->io_bp = bp; zio->io_bp_copy = *bp;
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200912120335.nBC3Zn0Z029042>