From owner-svn-src-user@FreeBSD.ORG Sun Oct 25 00:42:03 2009 Return-Path: Delivered-To: svn-src-user@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id 65E49106566C; Sun, 25 Oct 2009 00:42:03 +0000 (UTC) (envelope-from kmacy@FreeBSD.org) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:4f8:fff6::2c]) by mx1.freebsd.org (Postfix) with ESMTP id 537CB8FC0A; Sun, 25 Oct 2009 00:42:03 +0000 (UTC) Received: from svn.freebsd.org (localhost [127.0.0.1]) by svn.freebsd.org (8.14.3/8.14.3) with ESMTP id n9P0g3g5020919; Sun, 25 Oct 2009 00:42:03 GMT (envelope-from kmacy@svn.freebsd.org) Received: (from kmacy@localhost) by svn.freebsd.org (8.14.3/8.14.3/Submit) id n9P0g3sO020912; Sun, 25 Oct 2009 00:42:03 GMT (envelope-from kmacy@svn.freebsd.org) Message-Id: <200910250042.n9P0g3sO020912@svn.freebsd.org> From: Kip Macy Date: Sun, 25 Oct 2009 00:42:03 +0000 (UTC) To: src-committers@freebsd.org, svn-src-user@freebsd.org X-SVN-Group: user MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cc: Subject: svn commit: r198458 - in user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys X-BeenThere: svn-src-user@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: "SVN commit messages for the experimental " user" src tree" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sun, 25 Oct 2009 00:42:03 -0000 Author: kmacy Date: Sun Oct 25 00:42:03 2009 New Revision: 198458 URL: http://svn.freebsd.org/changeset/base/198458 Log: initial support for backing the ARC cache by the page cache Modified: user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c Modified: user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c ============================================================================== --- user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Sun Oct 25 00:37:59 2009 (r198457) +++ user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Sun Oct 25 00:42:03 2009 (r198458) @@ -126,6 +126,7 @@ #include #ifdef _KERNEL #include +#include #endif #include #include @@ -258,6 +259,7 @@ static arc_state_t ARC_l2c_only; typedef struct arc_stats { kstat_named_t arcstat_hits; + kstat_named_t arcstat_page_cache_hits; kstat_named_t arcstat_misses; kstat_named_t arcstat_demand_data_hits; kstat_named_t arcstat_demand_data_misses; @@ -307,6 +309,7 @@ typedef struct arc_stats { static arc_stats_t arc_stats = { { "hits", KSTAT_DATA_UINT64 }, + { "page_cache_hits", KSTAT_DATA_UINT64 }, { "misses", KSTAT_DATA_UINT64 }, { "demand_data_hits", KSTAT_DATA_UINT64 }, { "demand_data_misses", KSTAT_DATA_UINT64 }, @@ -512,6 +515,7 @@ static void arc_evict_ghost(arc_state_t #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ #define ARC_STORED (1 << 19) /* has been store()d to */ +#define ARC_BUF_CLONING (1 << 21) /* is being cloned */ #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) @@ -632,9 +636,10 @@ struct l2arc_buf_hdr { typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ + arc_buf_t *l2df_buf; void *l2df_data; size_t l2df_size; - void (*l2df_func)(void *, size_t); + void (*l2df_func)(arc_buf_t *, void *, size_t); list_node_t l2df_list_node; } l2arc_data_free_t; @@ -1190,8 +1195,8 @@ arc_data_buf_free(void *buf, uint64_t si atomic_add_64(&arc_size, -size); } -arc_buf_t * -arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) +static arc_buf_t * +_arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type, dva_t dva) { arc_buf_hdr_t *hdr; arc_buf_t *buf; @@ -1201,6 +1206,7 @@ arc_buf_alloc(spa_t *spa, int size, void ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; hdr->b_type = type; + hdr->b_dva = dva; hdr->b_spa = spa; hdr->b_state = arc_anon; hdr->b_arc_access = 0; @@ -1220,6 +1226,14 @@ arc_buf_alloc(spa_t *spa, int size, void return (buf); } +arc_buf_t * +arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) +{ + dva_t dva = {0ULL, 0ULL}; + + return (_arc_buf_alloc(spa, size, tag, type, dva)); +} + static arc_buf_t * arc_buf_clone(arc_buf_t *from) { @@ -1234,8 +1248,11 @@ arc_buf_clone(arc_buf_t *from) buf->b_private = NULL; buf->b_next = hdr->b_buf; hdr->b_buf = buf; + hdr->b_flags |= ARC_BUF_CLONING; arc_get_data_buf(buf); +#ifdef nomore bcopy(from->b_data, buf->b_data, size); +#endif hdr->b_datacnt += 1; return (buf); } @@ -1272,17 +1289,95 @@ arc_buf_add_ref(arc_buf_t *buf, void* ta data, metadata, hits); } +static void +arc_getblk(arc_buf_t *buf) +{ + uint64_t size = buf->b_hdr->b_size; + arc_buf_contents_t type = buf->b_hdr->b_type; + spa_t *spa = buf->b_hdr->b_spa; + off_t blkno = buf->b_hdr->b_dva.dva_word[1] & ~(1UL<<63); + struct buf *newbp, *bp; + arc_buf_t *tbuf; + struct vnode *vp; + int flags = 0; + + if (type == ARC_BUFC_METADATA) { + arc_space_consume(size); + } else { + ASSERT(type == ARC_BUFC_DATA); + flags = GB_NODUMP; + atomic_add_64(&arc_size, size); + } + + if (buf->b_hdr->b_flags & ARC_BUF_CLONING) { + newbp = geteblk(size, flags); + tbuf = buf; + + while (tbuf->b_next != NULL) + tbuf = tbuf->b_next; + bp = tbuf->b_bp; + vp = spa_get_vnode(spa); + + KASSERT((bp->b_blkno == bp->b_lblkno) && + (bp->b_blkno == blkno), + ("blkno mismatch b_blkno %ld b_lblkno %ld blkno %ld", + bp->b_blkno, bp->b_lblkno, blkno)); + newbp->b_bufobj = &vp->v_bufobj; + newbp->b_lblkno = blkno; + newbp->b_blkno = blkno; + newbp->b_offset = (blkno<<9); + + if (bp->b_vp != NULL) { + KASSERT(bp->b_xflags & BX_VNCLEAN, ("brelvp() on buffer that is not in splay")); + brelvp(bp); + } + + BO_LOCK(&vp->v_bufobj); + bgetvp(vp, newbp); + BO_UNLOCK(&vp->v_bufobj); + newbp->b_flags &= ~B_INVAL; + newbp->b_flags |= B_CACHE; + bp->b_flags |= B_INVAL; + bp->b_flags &= ~B_CACHE; + bcopy(bp->b_data, newbp->b_data, size); + buf->b_hdr->b_flags &= ~ARC_BUF_CLONING; + + } else if (BUF_EMPTY(buf->b_hdr)) { + newbp = geteblk(size, flags); + } else + newbp = getblk(spa_get_vnode(spa), blkno, + size, 0, 0, flags); + CTR2(KTR_BUF, "arc_getblk() bp=%p flags %X", + newbp, newbp->b_flags); + + BUF_KERNPROC(newbp); + buf->b_bp = newbp; + buf->b_data = newbp->b_data; +} + +static void +arc_brelse(arc_buf_t *buf, void *data, size_t size) +{ + +#ifdef INVARIANTS + if (buf->b_bp->b_vp) + KASSERT(buf->b_bp->b_xflags & BX_VNCLEAN, ("brelse() on buffer that is not in splay")); +#endif + brelse(buf->b_bp); +} + /* * Free the arc data buffer. If it is an l2arc write in progress, * the buffer is placed on l2arc_free_on_write to be freed later. */ static void -arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), - void *data, size_t size) +arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(arc_buf_t *, void *, size_t), + arc_buf_t *buf, void *data, size_t size) { if (HDR_L2_WRITING(hdr)) { l2arc_data_free_t *df; df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); + df->l2df_buf = buf; df->l2df_data = data; df->l2df_size = size; df->l2df_func = free_func; @@ -1291,7 +1386,7 @@ arc_buf_data_free(arc_buf_hdr_t *hdr, vo mutex_exit(&l2arc_free_on_write_mtx); ARCSTAT_BUMP(arcstat_l2_free_on_write); } else { - free_func(data, size); + free_func(buf, data, size); } } @@ -1309,13 +1404,13 @@ arc_buf_destroy(arc_buf_t *buf, boolean_ arc_cksum_verify(buf); if (!recycle) { if (type == ARC_BUFC_METADATA) { - arc_buf_data_free(buf->b_hdr, zio_buf_free, - buf->b_data, size); + arc_buf_data_free(buf->b_hdr, arc_brelse, + buf, buf->b_data, size); arc_space_return(size); } else { ASSERT(type == ARC_BUFC_DATA); - arc_buf_data_free(buf->b_hdr, - zio_data_buf_free, buf->b_data, size); + arc_buf_data_free(buf->b_hdr, arc_brelse, + buf, buf->b_data, size); atomic_add_64(&arc_size, -size); } } @@ -1514,7 +1609,7 @@ arc_buf_size(arc_buf_t *buf) * it can't get a hash_lock on, and so may not catch all candidates. * It may also return without evicting as much space as requested. */ -static void * +static struct buf * arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) { @@ -1526,11 +1621,12 @@ arc_evict(arc_state_t *state, spa_t *spa kmutex_t *lock, *evicted_lock; kmutex_t *hash_lock; boolean_t have_lock; - void *stolen = NULL; + struct buf *stolen = NULL; static int evict_metadata_offset, evict_data_offset; int idx, offset, list_count, count; ASSERT(state == arc_mru || state == arc_mfu); + ASSERT(recycle == FALSE); evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; @@ -1598,7 +1694,7 @@ evict_start: if (buf->b_efunc) { mutex_enter(&arc_eviction_mtx); arc_buf_destroy(buf, - buf->b_data == stolen, FALSE); + buf->b_bp == stolen, FALSE); ab->b_buf = buf->b_next; buf->b_hdr = &arc_eviction_hdr; buf->b_next = arc_eviction_list; @@ -1608,7 +1704,7 @@ evict_start: } else { rw_exit(&buf->b_lock); arc_buf_destroy(buf, - buf->b_data == stolen, TRUE); + buf->b_bp == stolen, TRUE); } } if (ab->b_datacnt == 0) { @@ -2267,14 +2363,7 @@ arc_get_data_buf(arc_buf_t *buf) * just allocate a new buffer. */ if (!arc_evict_needed(type)) { - if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); - arc_space_consume(size); - } else { - ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); - atomic_add_64(&arc_size, size); - } + arc_getblk(buf); goto out; } @@ -2297,17 +2386,8 @@ arc_get_data_buf(arc_buf_t *buf) state = (arc_mru->arcs_lsize[type] > 0 && mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; } - if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { - if (type == ARC_BUFC_METADATA) { - buf->b_data = zio_buf_alloc(size); - arc_space_consume(size); - } else { - ASSERT(type == ARC_BUFC_DATA); - buf->b_data = zio_data_buf_alloc(size); - atomic_add_64(&arc_size, size); - } - ARCSTAT_BUMP(arcstat_recycle_miss); - } + (void) arc_evict(state, NULL, size, FALSE, type); + arc_getblk(buf); ASSERT(buf->b_data != NULL); out: /* @@ -2528,6 +2608,9 @@ arc_read_done(zio_t *zio) arc_cksum_compute(buf, B_FALSE); + buf->b_bp->b_flags &= ~B_INVAL; + buf->b_bp->b_flags |= B_CACHE; + /* create copies of the data buffer for the callers */ abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { @@ -2734,9 +2817,9 @@ top: /* this block is not in the cache */ arc_buf_hdr_t *exists; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); - buf = arc_buf_alloc(spa, size, private, type); + buf = _arc_buf_alloc(spa, size, private, type, + *BP_IDENTITY(bp)); hdr = buf->b_hdr; - hdr->b_dva = *BP_IDENTITY(bp); hdr->b_birth = bp->blk_birth; hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; exists = buf_hash_insert(hdr, &hash_lock); @@ -2783,7 +2866,19 @@ top: arc_get_data_buf(buf); ASSERT(hdr->b_datacnt == 0); hdr->b_datacnt = 1; - + } + /* + * We hit in the page cache + * + */ + if ((buf->b_bp->b_flags & (B_CACHE|B_INVAL)) == B_CACHE) { + /* + * track the number of times + * the buffer was found in the cache + */ + ARCSTAT_BUMP(arcstat_page_cache_hits); + mutex_exit(hash_lock); + goto top; } acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); @@ -3211,7 +3306,6 @@ arc_write_done(zio_t *zio) arc_buf_hdr_t *hdr = buf->b_hdr; hdr->b_acb = NULL; - hdr->b_dva = *BP_IDENTITY(zio->io_bp); hdr->b_birth = zio->io_bp->blk_birth; hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; @@ -3224,7 +3318,32 @@ arc_write_done(zio_t *zio) if (!BUF_EMPTY(hdr)) { arc_buf_hdr_t *exists; kmutex_t *hash_lock; + /* + * Associate buffer with offset in the page cache + */ + struct buf *bp = buf->b_bp; + struct vnode *vp = spa_get_vnode(hdr->b_spa); + off_t blkno = hdr->b_dva.dva_word[1] & ~(1UL<<63); + + CTR2(KTR_BUF, "arc_write_done(%p) flags %X", + bp, bp->b_flags); + + if ((hdr->b_buf == buf) && + (bp->b_bufobj == NULL)) { + + bp->b_bufobj = &vp->v_bufobj; + bp->b_lblkno = blkno; + bp->b_blkno = blkno; + bp->b_offset = (blkno << 9); + BO_LOCK(bp->b_bufobj); + bgetvp(vp, bp); + BO_UNLOCK(bp->b_bufobj); + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_CACHE; + } + /* + */ arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); @@ -3987,7 +4106,7 @@ l2arc_do_free_on_write() df_prev = list_prev(buflist, df); ASSERT(df->l2df_data != NULL); ASSERT(df->l2df_func != NULL); - df->l2df_func(df->l2df_data, df->l2df_size); + df->l2df_func(df->l2df_buf, df->l2df_data, df->l2df_size); list_remove(buflist, df); kmem_free(df, sizeof (l2arc_data_free_t)); } Modified: user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c ============================================================================== --- user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c Sun Oct 25 00:37:59 2009 (r198457) +++ user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c Sun Oct 25 00:42:03 2009 (r198458) @@ -4299,3 +4299,10 @@ done: #endif #endif } + +struct vnode * +spa_get_vnode(spa_t *spa) +{ + + return (spa->spa_root_vdev->vdev_vnode); +} Modified: user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h ============================================================================== --- user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h Sun Oct 25 00:37:59 2009 (r198457) +++ user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h Sun Oct 25 00:42:03 2009 (r198458) @@ -52,6 +52,7 @@ struct arc_buf { void *b_data; arc_evict_func_t *b_efunc; void *b_private; + struct buf *b_bp; }; typedef enum arc_buf_contents { Modified: user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h ============================================================================== --- user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h Sun Oct 25 00:37:59 2009 (r198457) +++ user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h Sun Oct 25 00:42:03 2009 (r198458) @@ -534,6 +534,8 @@ extern void spa_prop_clear_bootfs(spa_t /* asynchronous event notification */ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); +extern struct vnode *spa_get_vnode(spa_t *spa); + #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ Modified: user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h ============================================================================== --- user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h Sun Oct 25 00:37:59 2009 (r198457) +++ user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h Sun Oct 25 00:42:03 2009 (r198458) @@ -144,7 +144,8 @@ struct vdev { list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ - + struct vnode *vdev_vnode; /* container for page cache */ + /* * Leaf vdev state. */ Modified: user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c ============================================================================== --- user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c Sun Oct 25 00:37:59 2009 (r198457) +++ user/kmacy/releng_8_fcs_buf/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c Sun Oct 25 00:42:03 2009 (r198458) @@ -1060,10 +1060,24 @@ vdev_open(vdev_t *vd) * inconsistently account for existing bp's. */ if (vd->vdev_top == vd) { + struct vnode *vp; + vd->vdev_deflate_ratio = (1<<17) / (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT); + } + if (vd->vdev_parent == NULL) { + struct vnode *vp; + error = getnewvnode("zpool" , NULL, &dead_vnodeops, &vp); + if (error != 0) + return (error); + + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + vnode_create_vobject(vp, 512, curthread); + vd->vdev_vnode = vp; + VOP_UNLOCK(vp, 0); + } /* * If a leaf vdev has a DTL, and seems healthy, then kick off a * resilver. But don't do this if we are doing a reopen for a @@ -1192,6 +1206,8 @@ vdev_close(vdev_t *vd) else vd->vdev_state = VDEV_STATE_CLOSED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + + vn_free(vd->vdev_vnode); } void