Date: Fri, 20 Mar 2015 18:39:08 +0000 (UTC)
From: Xin LI <delphij@FreeBSD.org>
To: src-committers@freebsd.org, svn-src-user@freebsd.org
Subject: svn commit: r280296 - in user/delphij/zfs-arc-rebase: cddl/contrib/opensolaris/cmd/ztest sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys
Message-ID: <201503201839.t2KId8x1086495@svn.freebsd.org>
Author: delphij Date: Fri Mar 20 18:39:07 2015 New Revision: 280296 URL: https://svnweb.freebsd.org/changeset/base/280296 Log: MFV r277426: Reduce L2ARC memory footprint by extracting L1 and L2 only fields and only allocate the fields when needed. Illumos issue: 5408 managing ZFS cache devices requires lots of RAM Modified: user/delphij/zfs-arc-rebase/cddl/contrib/opensolaris/cmd/ztest/ztest.c user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h Directory Properties: user/delphij/zfs-arc-rebase/cddl/contrib/opensolaris/ (props changed) user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/ (props changed) Modified: user/delphij/zfs-arc-rebase/cddl/contrib/opensolaris/cmd/ztest/ztest.c ============================================================================== --- user/delphij/zfs-arc-rebase/cddl/contrib/opensolaris/cmd/ztest/ztest.c Fri Mar 20 18:02:56 2015 (r280295) +++ user/delphij/zfs-arc-rebase/cddl/contrib/opensolaris/cmd/ztest/ztest.c Fri Mar 20 18:39:07 2015 (r280296) @@ -3907,7 +3907,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *z * assign an arcbuf to a dbuf. */ for (j = 0; j < s; j++) { - if (i != 5) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { bigbuf_arcbufs[j] = dmu_request_arcbuf(bonus_db, chunksize); } else { @@ -3931,7 +3931,8 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *z umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); for (j = 0; j < s; j++) { - if (i != 5) { + if (i != 5 || + chunksize < (SPA_MINBLOCKSIZE * 2)) { dmu_return_arcbuf(bigbuf_arcbufs[j]); } else { dmu_return_arcbuf( @@ -3975,7 +3976,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *z } for (off = bigoff, j = 0; j < s; j++, off += chunksize) { dmu_buf_t *dbt; - if (i != 5) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { bcopy((caddr_t)bigbuf + (off - bigoff), bigbuf_arcbufs[j]->b_data, chunksize); } else { @@ -3992,7 +3993,7 @@ ztest_dmu_read_write_zcopy(ztest_ds_t *z VERIFY(dmu_buf_hold(os, bigobj, off, FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); } - if (i != 5) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { dmu_assign_arcbuf(bonus_db, off, bigbuf_arcbufs[j], tx); } else { Modified: user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c ============================================================================== --- user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Fri Mar 20 18:02:56 2015 (r280295) +++ user/delphij/zfs-arc-rebase/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Fri Mar 20 18:39:07 2015 (r280296) @@ -111,7 +111,7 @@ * Note that the majority of the performance stats are manipulated * with atomic operations. 
* - * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: + * The L2ARC uses the l2ad_mtx on each vdev for the following: * * - L2ARC buflist creation * - L2ARC buflist eviction @@ -390,6 +390,7 @@ typedef struct arc_stats { kstat_named_t arcstat_l2_writes_hdr_miss; kstat_named_t arcstat_l2_evict_lock_retry; kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_evict_l1cached; kstat_named_t arcstat_l2_free_on_write; kstat_named_t arcstat_l2_abort_lowmem; kstat_named_t arcstat_l2_cksum_bad; @@ -471,6 +472,7 @@ static arc_stats_t arc_stats = { { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, { "l2_free_on_write", KSTAT_DATA_UINT64 }, { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, { "l2_cksum_bad", KSTAT_DATA_UINT64 }, @@ -574,8 +576,6 @@ static int arc_no_grow; /* Don't try to static uint64_t arc_tempreserve; static uint64_t arc_loaned_bytes; -typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; - typedef struct arc_callback arc_callback_t; struct arc_callback { @@ -596,29 +596,53 @@ struct arc_write_callback { arc_buf_t *awcb_buf; }; -struct arc_buf_hdr { - /* protected by hash lock */ - dva_t b_dva; - uint64_t b_birth; - uint64_t b_cksum0; - +/* + * ARC buffers are separated into multiple structs as a memory saving measure: + * - Common fields struct, always defined, and embedded within it: + * - L2-only fields, always allocated but undefined when not in L2ARC + * - L1-only fields, only allocated when in L1ARC + * + * Buffer in L1 Buffer only in L2 + * +------------------------+ +------------------------+ + * | arc_buf_hdr_t | | arc_buf_hdr_t | + * | | | | + * | | | | + * | | | | + * +------------------------+ +------------------------+ + * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | + * | (undefined if L1-only) | | | + * +------------------------+ +------------------------+ + * | l1arc_buf_hdr_t | + * | | + * | | + * | | + * | | + * +------------------------+ + * + * Because it's possible for the L2ARC to become extremely large, we can wind + * up eating a lot of memory in L2ARC buffer headers, so the size of a header + * is minimized by only allocating the fields necessary for an L1-cached buffer + * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and + * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple + * words in pointers. arc_hdr_realloc() is used to switch a header between + * these two allocation states. + */ +typedef struct l1arc_buf_hdr { kmutex_t b_freeze_lock; - zio_cksum_t *b_freeze_cksum; +#ifdef ZFS_DEBUG + /* + * used for debugging wtih kmem_flags - by allocating and freeing + * b_thawed when the buffer is thawed, we get a record of the stack + * trace that thawed it. 
+ */ void *b_thawed; +#endif - arc_buf_hdr_t *b_hash_next; arc_buf_t *b_buf; - arc_flags_t b_flags; uint32_t b_datacnt; - - arc_callback_t *b_acb; + /* for waiting on writes to complete */ kcondvar_t b_cv; - /* immutable */ - arc_buf_contents_t b_type; - uint64_t b_size; - uint64_t b_spa; - /* protected by arc state mutex */ arc_state_t *b_state; list_node_t b_arc_node; @@ -629,8 +653,46 @@ struct arc_buf_hdr { /* self protecting */ refcount_t b_refcnt; - l2arc_buf_hdr_t *b_l2hdr; + arc_callback_t *b_acb; + /* temporary buffer holder for in-flight compressed data */ + void *b_tmp_cdata; +} l1arc_buf_hdr_t; + +typedef struct l2arc_dev l2arc_dev_t; + +typedef struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ + /* real alloc'd buffer size depending on b_compress applied */ + int32_t b_asize; + list_node_t b_l2node; +} l2arc_buf_hdr_t; + +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + /* + * Even though this checksum is only set/verified when a buffer is in + * the L1 cache, it needs to be in the set of common fields because it + * must be preserved from the time before a buffer is written out to + * L2ARC until after it is read back in. + */ + zio_cksum_t *b_freeze_cksum; + + arc_buf_hdr_t *b_hash_next; + arc_flags_t b_flags; + + /* immutable */ + int32_t b_size; + uint64_t b_spa; + + /* L2ARC fields. Undefined when not in L2ARC. */ + l2arc_buf_hdr_t b_l2hdr; + /* L1ARC fields. Undefined when in l2arc_only state */ + l1arc_buf_hdr_t b_l1hdr; }; #ifdef _KERNEL @@ -667,22 +729,38 @@ static arc_buf_hdr_t arc_eviction_hdr; #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) -#define HDR_FREE_IN_PROGRESS(hdr) \ - ((hdr)->b_flags & ARC_FLAG_FREE_IN_PROGRESS) + #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) +#define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) #define HDR_L2_READING(hdr) \ - ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS && \ - (hdr)->b_l2hdr != NULL) + (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ + ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_ISTYPE_METADATA(hdr) \ + ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) +#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) + +#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) +#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) + +/* For storing compression mode in b_flags */ +#define HDR_COMPRESS_OFFSET 24 +#define HDR_COMPRESS_NBITS 7 + +#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET(hdr->b_flags, \ + HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS)) +#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET(hdr->b_flags, \ + HDR_COMPRESS_OFFSET, HDR_COMPRESS_NBITS, (cmp)) + /* * Other sizes */ -#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) -#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) +#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* * Hash table routines @@ -806,7 +884,7 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_onl /* * L2ARC Internals */ -typedef struct l2arc_dev { +struct l2arc_dev { vdev_t 
*l2ad_vdev; /* vdev */ spa_t *l2ad_spa; /* spa */ uint64_t l2ad_hand; /* next write location */ @@ -815,15 +893,15 @@ typedef struct l2arc_dev { uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ boolean_t l2ad_writing; /* currently writing */ - list_t *l2ad_buflist; /* buffer list */ + kmutex_t l2ad_mtx; /* lock for buffer list */ + list_t l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ -} l2arc_dev_t; +}; static list_t L2ARC_dev_list; /* device list */ static list_t *l2arc_dev_list; /* device list pointer */ static kmutex_t l2arc_dev_mtx; /* device list mutex */ static l2arc_dev_t *l2arc_dev_last; /* last device used */ -static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ static list_t L2ARC_free_on_write; /* free after write buf list */ static list_t *l2arc_free_on_write; /* free after write list ptr */ static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ @@ -843,18 +921,6 @@ typedef struct l2arc_write_callback { arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ } l2arc_write_callback_t; -struct l2arc_buf_hdr { - /* protected by arc_buf_hdr mutex */ - l2arc_dev_t *b_dev; /* L2ARC device */ - uint64_t b_daddr; /* disk address, offset byte */ - /* compression applied to buffer data */ - enum zio_compress b_compress; - /* real alloc'd buffer size depending on b_compress applied */ - int b_asize; - /* temporary buffer holder for in-flight compressed data */ - void *b_tmp_cdata; -}; - typedef struct l2arc_data_free { /* protected by l2arc_free_on_write_mtx */ void *l2df_data; @@ -873,12 +939,13 @@ static int arc_evict_needed(arc_buf_cont static void arc_evict_ghost(arc_state_t *, uint64_t, int64_t); static void arc_buf_watch(arc_buf_t *); +static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); +static uint32_t arc_bufc_to_flags(arc_buf_contents_t); + static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); static void l2arc_read_done(zio_t *); -static void l2arc_hdr_stat_add(void); -static void l2arc_hdr_stat_remove(void); -static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *); +static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); static void l2arc_release_cdata_buf(arc_buf_hdr_t *); @@ -901,8 +968,7 @@ buf_hash(uint64_t spa, const dva_t *dva, #define BUF_EMPTY(buf) \ ((buf)->b_dva.dva_word[0] == 0 && \ - (buf)->b_dva.dva_word[1] == 0 && \ - (buf)->b_cksum0 == 0) + (buf)->b_dva.dva_word[1] == 0) #define BUF_EQUAL(spa, dva, birth, buf) \ ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ @@ -915,7 +981,6 @@ buf_discard_identity(arc_buf_hdr_t *hdr) hdr->b_dva.dva_word[0] = 0; hdr->b_dva.dva_word[1] = 0; hdr->b_birth = 0; - hdr->b_cksum0 = 0; } static arc_buf_hdr_t * @@ -945,6 +1010,7 @@ buf_hash_find(uint64_t spa, const blkptr * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. + * If lockp == NULL, the caller is assumed to already hold the hash lock. 
*/ static arc_buf_hdr_t * buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) @@ -957,8 +1023,14 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmut ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); ASSERT(hdr->b_birth != 0); ASSERT(!HDR_IN_HASH_TABLE(hdr)); - *lockp = hash_lock; - mutex_enter(hash_lock); + + if (lockp != NULL) { + *lockp = hash_lock; + mutex_enter(hash_lock); + } else { + ASSERT(MUTEX_HELD(hash_lock)); + } + for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; fhdr = fhdr->b_hash_next, i++) { if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) @@ -1013,7 +1085,8 @@ buf_hash_remove(arc_buf_hdr_t *hdr) /* * Global data structures and functions for the buf kmem cache. */ -static kmem_cache_t *hdr_cache; +static kmem_cache_t *hdr_full_cache; +static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; static void @@ -1025,7 +1098,8 @@ buf_fini(void) (buf_hash_table.ht_mask + 1) * sizeof (void *)); for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); - kmem_cache_destroy(hdr_cache); + kmem_cache_destroy(hdr_full_cache); + kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } @@ -1035,15 +1109,27 @@ buf_fini(void) */ /* ARGSUSED */ static int -hdr_cons(void *vbuf, void *unused, int kmflag) +hdr_full_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; - bzero(hdr, sizeof (arc_buf_hdr_t)); - refcount_create(&hdr->b_refcnt); - cv_init(&hdr->b_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&hdr->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); - arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); + bzero(hdr, HDR_FULL_SIZE); + cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); + refcount_create(&hdr->b_l1hdr.b_refcnt); + mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); + + return (0); +} + +/* ARGSUSED */ +static int +hdr_l2only_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *hdr = vbuf; + + bzero(hdr, HDR_L2ONLY_SIZE); + arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); return (0); } @@ -1067,15 +1153,25 @@ buf_cons(void *vbuf, void *unused, int k */ /* ARGSUSED */ static void -hdr_dest(void *vbuf, void *unused) +hdr_full_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *hdr = vbuf; + + ASSERT(BUF_EMPTY(hdr)); + cv_destroy(&hdr->b_l1hdr.b_cv); + refcount_destroy(&hdr->b_l1hdr.b_refcnt); + mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); + arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); +} + +/* ARGSUSED */ +static void +hdr_l2only_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; ASSERT(BUF_EMPTY(hdr)); - refcount_destroy(&hdr->b_refcnt); - cv_destroy(&hdr->b_cv); - mutex_destroy(&hdr->b_freeze_lock); - arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); + arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); } /* ARGSUSED */ @@ -1129,8 +1225,11 @@ retry: goto retry; } - hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), - 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); + hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, + 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); + hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", + HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, + NULL, NULL, 0); buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); @@ -1144,6 +1243,81 @@ retry: } } +/* + * Transition between the two allocation states for the arc_buf_hdr struct. 
+ * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without + * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller + * version is used when a cache buffer is only in the L2ARC in order to reduce + * memory usage. + */ +static arc_buf_hdr_t * +arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) +{ + ASSERT(HDR_HAS_L2HDR(hdr)); + + arc_buf_hdr_t *nhdr; + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + + ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || + (old == hdr_l2only_cache && new == hdr_full_cache)); + + nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); + + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); + buf_hash_remove(hdr); + + bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); + if (new == hdr_full_cache) { + nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; + /* + * arc_access and arc_change_state need to be aware that a + * header has just come out of L2ARC, so we set its state to + * l2c_only even though it's about to change. + */ + nhdr->b_l1hdr.b_state = arc_l2c_only; + } else { + ASSERT(hdr->b_l1hdr.b_buf == NULL); + ASSERT0(hdr->b_l1hdr.b_datacnt); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + /* + * We might be removing the L1hdr of a buffer which was just + * written out to L2ARC. If such a buffer is compressed then we + * need to free its b_tmp_cdata before destroying the header. + */ + if (hdr->b_l1hdr.b_tmp_cdata != NULL && + HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) + l2arc_release_cdata_buf(hdr); + nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; + } + /* + * The header has been reallocated so we need to re-insert it into any + * lists it was on. + */ + (void) buf_hash_insert(nhdr, NULL); + + ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); + + mutex_enter(&dev->l2ad_mtx); + + /* + * We must place the realloc'ed header back into the list at + * the same spot. Otherwise, if it's placed earlier in the list, + * l2arc_write_buffers() could find it during the function's + * write phase, and try to write it out to the l2arc. 
+ */ + list_insert_after(&dev->l2ad_buflist, hdr, nhdr); + list_remove(&dev->l2ad_buflist, hdr); + + mutex_exit(&dev->l2ad_mtx); + + buf_discard_identity(hdr); + hdr->b_freeze_cksum = NULL; + kmem_cache_free(old, hdr); + + return (nhdr); +} + + #define ARC_MINTIME (hz>>4) /* 62 ms */ static void @@ -1154,16 +1328,15 @@ arc_cksum_verify(arc_buf_t *buf) if (!(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_freeze_lock); - if (buf->b_hdr->b_freeze_cksum == NULL || - (buf->b_hdr->b_flags & ARC_FLAG_IO_ERROR)) { - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); + if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return; } fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) panic("buffer modified while frozen!"); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); } static int @@ -1172,10 +1345,10 @@ arc_cksum_equal(arc_buf_t *buf) zio_cksum_t zc; int equal; - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return (equal); } @@ -1186,15 +1359,15 @@ arc_cksum_compute(arc_buf_t *buf, boolea if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) return; - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); return; } buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); fletcher_2_native(buf->b_data, buf->b_hdr->b_size, buf->b_hdr->b_freeze_cksum); - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); #ifdef illumos arc_buf_watch(buf); #endif @@ -1245,30 +1418,58 @@ arc_buf_watch(arc_buf_t *buf) } #endif /* illumos */ +static arc_buf_contents_t +arc_buf_type(arc_buf_hdr_t *hdr) +{ + if (HDR_ISTYPE_METADATA(hdr)) { + return (ARC_BUFC_METADATA); + } else { + return (ARC_BUFC_DATA); + } +} + +static uint32_t +arc_bufc_to_flags(arc_buf_contents_t type) +{ + switch (type) { + case ARC_BUFC_DATA: + /* metadata field is 0 if buffer contains normal data */ + return (0); + case ARC_BUFC_METADATA: + return (ARC_FLAG_BUFC_METADATA); + default: + break; + } + panic("undefined ARC buffer type!"); + return ((uint32_t)-1); +} + void arc_buf_thaw(arc_buf_t *buf) { if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_state != arc_anon) + if (buf->b_hdr->b_l1hdr.b_state != arc_anon) panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_FLAG_IO_IN_PROGRESS) + if (HDR_IO_IN_PROGRESS(buf->b_hdr)) panic("modifying buffer while i/o in progress!"); arc_cksum_verify(buf); } - mutex_enter(&buf->b_hdr->b_freeze_lock); + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); buf->b_hdr->b_freeze_cksum = NULL; } +#ifdef ZFS_DEBUG if (zfs_flags & ZFS_DEBUG_MODIFY) { - if (buf->b_hdr->b_thawed) - kmem_free(buf->b_hdr->b_thawed, 1); - buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP); + if (buf->b_hdr->b_l1hdr.b_thawed != NULL) + kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); + buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, 
KM_SLEEP); } +#endif - mutex_exit(&buf->b_hdr->b_freeze_lock); + mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); #ifdef illumos arc_buf_unwatch(buf); @@ -1287,7 +1488,7 @@ arc_buf_freeze(arc_buf_t *buf) mutex_enter(hash_lock); ASSERT(buf->b_hdr->b_freeze_cksum != NULL || - buf->b_hdr->b_state == arc_anon); + buf->b_hdr->b_l1hdr.b_state == arc_anon); arc_cksum_compute(buf, B_FALSE); mutex_exit(hash_lock); @@ -1296,30 +1497,34 @@ arc_buf_freeze(arc_buf_t *buf) static void add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(MUTEX_HELD(hash_lock)); + arc_state_t *state = hdr->b_l1hdr.b_state; + + if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && + (state != arc_anon)) { + /* We don't use the L2-only state list. */ + if (state != arc_l2c_only) { + uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; + list_t *list = &state->arcs_list[arc_buf_type(hdr)]; + uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; - if ((refcount_add(&hdr->b_refcnt, tag) == 1) && - (hdr->b_state != arc_anon)) { - uint64_t delta = hdr->b_size * hdr->b_datacnt; - list_t *list = &hdr->b_state->arcs_list[hdr->b_type]; - uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; - - ASSERT(!MUTEX_HELD(&hdr->b_state->arcs_mtx)); - mutex_enter(&hdr->b_state->arcs_mtx); - ASSERT(list_link_active(&hdr->b_arc_node)); - list_remove(list, hdr); - if (GHOST_STATE(hdr->b_state)) { - ASSERT0(hdr->b_datacnt); - ASSERT3P(hdr->b_buf, ==, NULL); - delta = hdr->b_size; - } - ASSERT(delta > 0); - ASSERT3U(*size, >=, delta); - atomic_add_64(size, -delta); - mutex_exit(&hdr->b_state->arcs_mtx); + ASSERT(!MUTEX_HELD(&state->arcs_mtx)); + mutex_enter(&state->arcs_mtx); + ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); + list_remove(list, hdr); + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_datacnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + delta = hdr->b_size; + } + ASSERT(delta > 0); + ASSERT3U(*size, >=, delta); + atomic_add_64(size, -delta); + mutex_exit(&state->arcs_mtx); + } /* remove the prefetch flag if we get a reference */ - if (hdr->b_flags & ARC_FLAG_PREFETCH) - hdr->b_flags &= ~ARC_FLAG_PREFETCH; + hdr->b_flags &= ~ARC_FLAG_PREFETCH; } } @@ -1327,21 +1532,27 @@ static int remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) { int cnt; - arc_state_t *state = hdr->b_state; + arc_state_t *state = hdr->b_l1hdr.b_state; + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); ASSERT(!GHOST_STATE(state)); - if (((cnt = refcount_remove(&hdr->b_refcnt, tag)) == 0) && + /* + * arc_l2c_only counts as a ghost state so we don't need to explicitly + * check to prevent usage of the arc_l2c_only list. 
+ */ + if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && (state != arc_anon)) { - uint64_t *size = &state->arcs_lsize[hdr->b_type]; + uint64_t *size = &state->arcs_lsize[arc_buf_type(hdr)]; ASSERT(!MUTEX_HELD(&state->arcs_mtx)); mutex_enter(&state->arcs_mtx); - ASSERT(!list_link_active(&hdr->b_arc_node)); - list_insert_head(&state->arcs_list[hdr->b_type], hdr); - ASSERT(hdr->b_datacnt > 0); - atomic_add_64(size, hdr->b_size * hdr->b_datacnt); + ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); + list_insert_head(&state->arcs_list[arc_buf_type(hdr)], hdr); + ASSERT(hdr->b_l1hdr.b_datacnt > 0); + atomic_add_64(size, hdr->b_size * + hdr->b_l1hdr.b_datacnt); mutex_exit(&state->arcs_mtx); } return (cnt); @@ -1355,40 +1566,60 @@ static void arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { - arc_state_t *old_state = hdr->b_state; - int64_t refcnt = refcount_count(&hdr->b_refcnt); + arc_state_t *old_state; + int64_t refcnt; + uint32_t datacnt; uint64_t from_delta, to_delta; + arc_buf_contents_t buftype = arc_buf_type(hdr); + + /* + * We almost always have an L1 hdr here, since we call arc_hdr_realloc() + * in arc_read() when bringing a buffer out of the L2ARC. However, the + * L1 hdr doesn't always exist when we change state to arc_anon before + * destroying a header, in which case reallocating to add the L1 hdr is + * pointless. + */ + if (HDR_HAS_L1HDR(hdr)) { + old_state = hdr->b_l1hdr.b_state; + refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); + datacnt = hdr->b_l1hdr.b_datacnt; + } else { + old_state = arc_l2c_only; + refcnt = 0; + datacnt = 0; + } ASSERT(MUTEX_HELD(hash_lock)); ASSERT3P(new_state, !=, old_state); - ASSERT(refcnt == 0 || hdr->b_datacnt > 0); - ASSERT(hdr->b_datacnt == 0 || !GHOST_STATE(new_state)); - ASSERT(hdr->b_datacnt <= 1 || old_state != arc_anon); + ASSERT(refcnt == 0 || datacnt > 0); + ASSERT(!GHOST_STATE(new_state) || datacnt == 0); + ASSERT(old_state != arc_anon || datacnt <= 1); - from_delta = to_delta = hdr->b_datacnt * hdr->b_size; + from_delta = to_delta = datacnt * hdr->b_size; /* * If this buffer is evictable, transfer it from the * old state list to the new state list. */ if (refcnt == 0) { - if (old_state != arc_anon) { + if (old_state != arc_anon && old_state != arc_l2c_only) { int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); - uint64_t *size = &old_state->arcs_lsize[hdr->b_type]; + uint64_t *size = &old_state->arcs_lsize[buftype]; if (use_mutex) mutex_enter(&old_state->arcs_mtx); - ASSERT(list_link_active(&hdr->b_arc_node)); - list_remove(&old_state->arcs_list[hdr->b_type], hdr); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(list_link_active(&hdr->b_l1hdr.b_arc_node)); + list_remove(&old_state->arcs_list[buftype], hdr); /* * If prefetching out of the ghost cache, * we will have a non-zero datacnt. 
*/ - if (GHOST_STATE(old_state) && hdr->b_datacnt == 0) { + if (GHOST_STATE(old_state) && datacnt == 0) { /* ghost elements have a ghost size */ - ASSERT(hdr->b_buf == NULL); + ASSERT(hdr->b_l1hdr.b_buf == NULL); from_delta = hdr->b_size; } ASSERT3U(*size, >=, from_delta); @@ -1397,20 +1628,26 @@ arc_change_state(arc_state_t *new_state, if (use_mutex) mutex_exit(&old_state->arcs_mtx); } - if (new_state != arc_anon) { + if (new_state != arc_anon && new_state != arc_l2c_only) { int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); - uint64_t *size = &new_state->arcs_lsize[hdr->b_type]; + uint64_t *size = &new_state->arcs_lsize[buftype]; + /* + * An L1 header always exists here, since if we're + * moving to some L1-cached state (i.e. not l2c_only or + * anonymous), we realloc the header to add an L1hdr + * beforehand. + */ + ASSERT(HDR_HAS_L1HDR(hdr)); if (use_mutex) mutex_enter(&new_state->arcs_mtx); - list_insert_head(&new_state->arcs_list[hdr->b_type], - hdr); + list_insert_head(&new_state->arcs_list[buftype], hdr); /* ghost elements have a ghost size */ if (GHOST_STATE(new_state)) { - ASSERT(hdr->b_datacnt == 0); - ASSERT(hdr->b_buf == NULL); + ASSERT0(datacnt); + ASSERT(hdr->b_l1hdr.b_buf == NULL); to_delta = hdr->b_size; } atomic_add_64(size, to_delta); @@ -1424,20 +1661,22 @@ arc_change_state(arc_state_t *new_state, if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) buf_hash_remove(hdr); - /* adjust state sizes */ - if (to_delta) + /* adjust state sizes (ignore arc_l2c_only) */ + if (to_delta && new_state != arc_l2c_only) atomic_add_64(&new_state->arcs_size, to_delta); - if (from_delta) { + if (from_delta && old_state != arc_l2c_only) { ASSERT3U(old_state->arcs_size, >=, from_delta); atomic_add_64(&old_state->arcs_size, -from_delta); } - hdr->b_state = new_state; + if (HDR_HAS_L1HDR(hdr)) + hdr->b_l1hdr.b_state = new_state; - /* adjust l2arc hdr stats */ - if (new_state == arc_l2c_only) - l2arc_hdr_stat_add(); - else if (old_state == arc_l2c_only) - l2arc_hdr_stat_remove(); + /* + * L2 headers should never be on the L2 state list since they don't + * have L1 headers allocated. 
+ */ + ASSERT(list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && + list_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); } void @@ -1493,31 +1732,36 @@ arc_space_return(uint64_t space, arc_spa } arc_buf_t * -arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) +arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) { arc_buf_hdr_t *hdr; arc_buf_t *buf; ASSERT3U(size, >, 0); - hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); ASSERT(BUF_EMPTY(hdr)); + ASSERT3P(hdr->b_freeze_cksum, ==, NULL); hdr->b_size = size; - hdr->b_type = type; hdr->b_spa = spa_load_guid(spa); - hdr->b_state = arc_anon; - hdr->b_arc_access = 0; + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; buf->b_next = NULL; - hdr->b_buf = buf; + + hdr->b_flags = arc_bufc_to_flags(type); + hdr->b_flags |= ARC_FLAG_HAS_L1HDR; + + hdr->b_l1hdr.b_buf = buf; + hdr->b_l1hdr.b_state = arc_anon; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_datacnt = 1; + arc_get_data_buf(buf); - hdr->b_datacnt = 1; - hdr->b_flags = 0; - ASSERT(refcount_is_zero(&hdr->b_refcnt)); - (void) refcount_add(&hdr->b_refcnt, tag); + ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); return (buf); } @@ -1550,8 +1794,9 @@ arc_return_buf(arc_buf_t *buf, void *tag arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); - (void) refcount_add(&hdr->b_refcnt, tag); - (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); atomic_add_64(&arc_loaned_bytes, -hdr->b_size); } @@ -1560,12 +1805,12 @@ arc_return_buf(arc_buf_t *buf, void *tag void arc_loan_inuse_buf(arc_buf_t *buf, void *tag) { - arc_buf_hdr_t *hdr; + arc_buf_hdr_t *hdr = buf->b_hdr; ASSERT(buf->b_data != NULL); - hdr = buf->b_hdr; - (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); - (void) refcount_remove(&hdr->b_refcnt, tag); + ASSERT(HDR_HAS_L1HDR(hdr)); + (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); + (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); buf->b_efunc = NULL; buf->b_private = NULL; @@ -1579,15 +1824,16 @@ arc_buf_clone(arc_buf_t *from) arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; - ASSERT(hdr->b_state != arc_anon); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(hdr->b_l1hdr.b_state != arc_anon); buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; buf->b_efunc = NULL; buf->b_private = NULL; - buf->b_next = hdr->b_buf; - hdr->b_buf = buf; + buf->b_next = hdr->b_l1hdr.b_buf; + hdr->b_l1hdr.b_buf = buf; arc_get_data_buf(buf); bcopy(from->b_data, buf->b_data, size); @@ -1597,11 +1843,11 @@ arc_buf_clone(arc_buf_t *from) * then track the size and number of duplicates. These stats will be * updated as duplicate buffers are created and destroyed. 
*/ - if (hdr->b_type == ARC_BUFC_DATA) { + if (HDR_ISTYPE_DATA(hdr)) { ARCSTAT_BUMP(arcstat_duplicate_buffers); ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); } - hdr->b_datacnt += 1; + hdr->b_l1hdr.b_datacnt += 1; return (buf); } @@ -1624,17 +1870,20 @@ arc_buf_add_ref(arc_buf_t *buf, void* ta hash_lock = HDR_LOCK(buf->b_hdr); mutex_enter(hash_lock); hdr = buf->b_hdr; + ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); mutex_exit(&buf->b_evict_lock); *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
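[Editor's note: the heart of this change is the new header layout described in the big block comment added to arc.c above: the L1-only fields are moved to the end of arc_buf_hdr_t so that an L2-only header can be allocated as just the prefix of the full struct, sized with offsetof() (HDR_L2ONLY_SIZE) and backed by a second, smaller kmem cache (hdr_l2only_cache). The following is a minimal standalone C sketch of that allocation trick, not the ZFS code: the struct and function names (buf_hdr_t, hdr_alloc_l2only, hdr_promote_to_full) are made up for illustration, plain malloc/free stands in for the kmem caches, and all the locking, hash-table, and buflist maintenance that arc_hdr_realloc() performs is omitted.]

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define FLAG_HAS_L1HDR  0x1     /* header carries the L1-only fields */

    typedef struct l2only_fields {
            uint64_t        daddr;          /* offset on the cache vdev */
            int32_t         asize;          /* allocated (possibly compressed) size */
    } l2only_fields_t;

    typedef struct l1only_fields {
            void            *buf;           /* in-core data, only when L1-cached */
            uint32_t        datacnt;
            uint64_t        refcnt;
    } l1only_fields_t;

    typedef struct buf_hdr {
            /* common fields, always present */
            uint64_t        dva[2];
            uint64_t        birth;
            uint32_t        flags;

            /* L2-only fields: small, so always embedded */
            l2only_fields_t l2hdr;

            /* L1-only fields: kept last so they can be omitted entirely */
            l1only_fields_t l1hdr;
    } buf_hdr_t;

    /* Full header vs. the common+L2 prefix, in the spirit of HDR_L2ONLY_SIZE. */
    #define HDR_FULL_SIZE   ((int64_t)sizeof (buf_hdr_t))
    #define HDR_L2ONLY_SIZE ((int64_t)offsetof(buf_hdr_t, l1hdr))

    /* Allocate an L2-only header: just the prefix of the full struct. */
    static buf_hdr_t *
    hdr_alloc_l2only(void)
    {
            buf_hdr_t *hdr = malloc(HDR_L2ONLY_SIZE);

            if (hdr != NULL)
                    memset(hdr, 0, HDR_L2ONLY_SIZE);
            return (hdr);
    }

    /*
     * Grow an L2-only header into a full one, loosely analogous to
     * arc_hdr_realloc(): allocate the bigger struct, copy the common
     * prefix, zero the L1 fields, and free the old allocation.
     */
    static buf_hdr_t *
    hdr_promote_to_full(buf_hdr_t *old)
    {
            buf_hdr_t *nhdr = malloc(HDR_FULL_SIZE);

            if (nhdr == NULL)
                    return (NULL);
            memcpy(nhdr, old, HDR_L2ONLY_SIZE);     /* common + L2 fields only */
            memset(&nhdr->l1hdr, 0, sizeof (nhdr->l1hdr));
            nhdr->flags |= FLAG_HAS_L1HDR;
            free(old);
            return (nhdr);
    }

    int
    main(void)
    {
            printf("full hdr: %lld bytes, L2-only hdr: %lld bytes\n",
                (long long)HDR_FULL_SIZE, (long long)HDR_L2ONLY_SIZE);

            buf_hdr_t *hdr = hdr_alloc_l2only();
            if (hdr == NULL)
                    return (1);
            hdr = hdr_promote_to_full(hdr);
            free(hdr);
            return (0);
    }

The prefix allocation is what shrinks the per-buffer footprint: when a block lives only on a cache device, none of the L1 machinery (data pointer, refcount, condvar, freeze lock, state list linkage) needs to exist, which is exactly the situation the illumos 5408 report ("managing ZFS cache devices requires lots of RAM") describes.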