Date:      Mon, 18 Jul 2016 06:57:24 +0000 (UTC)
From:      Andriy Gapon <avg@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-vendor@freebsd.org
Subject:   svn commit: r302991 - vendor-sys/illumos/dist/uts/common/fs/zfs vendor-sys/illumos/dist/uts/common/fs/zfs/sys vendor/illumos/dist/cmd/zdb vendor/illumos/dist/cmd/ztest
Message-ID:  <201607180657.u6I6vOxT008306@repo.freebsd.org>

Author: avg
Date: Mon Jul 18 06:57:24 2016
New Revision: 302991
URL: https://svnweb.freebsd.org/changeset/base/302991

Log:
  6950 ARC should cache compressed data
  
  illumos/illumos-gate@dcbf3bd6a1f1360fc1afcee9e22c6dcff7844bf2
  https://github.com/illumos/illumos-gate/commit/dcbf3bd6a1f1360fc1afcee9e22c6dcff7844bf2
  
  https://www.illumos.org/issues/6950
    When reading compressed data from disk, the ARC should keep the compressed
    block cached and only decompress it when consumers access the block. The
    uncompressed data should be short-lived allowing the ARC to cache a much larger
    amount of data. The DMU would also maintain a smaller cache of uncompressed
    blocks to minimize the impact of decompressing frequently accessed blocks.
  
  Reviewed by: Prakash Surya <prakash.surya@delphix.com>
  Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
  Reviewed by: Matt Ahrens <mahrens@delphix.com>
  Reviewed by: Paul Dagnelie <pcd@delphix.com>
  Reviewed by: Don Brady <don.brady@intel.com>
  Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
  Approved by: Richard Lowe <richlowe@richlowe.net>
  Author: George Wilson <george.wilson@delphix.com>
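
  The read path described above can be summarized with a small sketch.
  This is an illustrative model only, not the illumos API: every name
  below is hypothetical, and the real logic lives in arc.c in the diff
  that follows (see arc_decompress() and the "ARC operation" comment).

	#include <stddef.h>
	#include <string.h>

	/*
	 * Toy model of a cached block: the header keeps the on-disk
	 * (possibly compressed) bytes; consumers only ever see
	 * uncompressed data.
	 */
	typedef struct toy_hdr {
		void	*pdata;		/* cached copy of the on-disk block */
		size_t	psize;		/* physical (on-disk) size */
		size_t	lsize;		/* logical (uncompressed) size */
		int	compressed;	/* pdata matches on-disk bytes? */
	} toy_hdr_t;

	/*
	 * Consumer read: decompress lazily, keeping the compressed copy
	 * cached in the header for future readers.
	 */
	static int
	toy_read(const toy_hdr_t *hdr, void *dst,
	    int (*decomp)(const void *, size_t, void *, size_t))
	{
		if (!hdr->compressed) {
			/* not compressed on disk: copy the bytes as-is */
			memcpy(dst, hdr->pdata, hdr->lsize);
			return (0);
		}
		/* decompress into the short-lived consumer buffer */
		return (decomp(hdr->pdata, hdr->psize, dst, hdr->lsize));
	}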

Modified:
  vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_diff.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_send.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_traverse.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dnode.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dnode_sync.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/dsl_scan.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/refcount.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/arc.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/refcount.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/spa.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zio_checksum.h
  vendor-sys/illumos/dist/uts/common/fs/zfs/zil.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zio.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zio_checksum.c

Changes in other areas also in this revision:
Modified:
  vendor/illumos/dist/cmd/zdb/zdb.c
  vendor/illumos/dist/cmd/ztest/ztest.c

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c	Mon Jul 18 06:47:08 2016	(r302990)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/arc.c	Mon Jul 18 06:57:24 2016	(r302991)
@@ -120,9 +120,134 @@
  *	- ARC header release, as it removes from L2ARC buflists
  */
 
+/*
+ * ARC operation:
+ *
+ * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
+ * This structure can point to a block that is still in the cache, point to
+ * one that is only accessible on an L2ARC device, or provide
+ * information about a block that was recently evicted. If a block is
+ * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
+ * information to retrieve it from the L2ARC device. This information is
+ * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. The data
+ * of a block in this state cannot be accessed directly.
+ *
+ * Blocks that are actively being referenced or have not been evicted
+ * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
+ * the arc_buf_hdr_t that will point to the data block in memory. A block can
+ * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
+ * caches data in two ways -- in a list of arc buffers (arc_buf_t) and
+ * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata).
+ * Each arc buffer (arc_buf_t) is actively accessed by a specific ARC
+ * consumer and always contains uncompressed data. The ARC provides
+ * references to this data and keeps it cached until it is no longer in
+ * use. Typically, the ARC tries to cache only the L1ARC's physical data
+ * block and aggressively evicts any arc_buf_t that is no longer referenced.
+ * The amount of memory consumed by the arc_buf_t's can be seen via the
+ * "overhead_size" kstat.
+ *
+ *
+ *                arc_buf_hdr_t
+ *                +-----------+
+ *                |           |
+ *                |           |
+ *                |           |
+ *                +-----------+
+ * l2arc_buf_hdr_t|           |
+ *                |           |
+ *                +-----------+
+ * l1arc_buf_hdr_t|           |
+ *                |           |                 arc_buf_t
+ *                |    b_buf  +------------>+---------+      arc_buf_t
+ *                |           |             |b_next   +---->+---------+
+ *                |  b_pdata  +-+           |---------|     |b_next   +-->NULL
+ *                +-----------+ |           |         |     +---------+
+ *                              |           |b_data   +-+   |         |
+ *                              |           +---------+ |   |b_data   +-+
+ *                              +->+------+             |   +---------+ |
+ *                   (potentially) |      |             |               |
+ *                     compressed  |      |             |               |
+ *                        data     +------+             |               v
+ *                                                      +->+------+     +------+
+ *                                            uncompressed |      |     |      |
+ *                                                data     |      |     |      |
+ *                                                         +------+     +------+
+ *
+ * The L1ARC's data pointer, however, may or may not be uncompressed. The
+ * ARC has the ability to store the physical data (b_pdata) associated with
+ * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk
+ * physical block, it will match its on-disk compression characteristics.
+ * If the block on-disk is compressed, then the physical data block
+ * in the cache will also be compressed and vice-versa. This behavior
+ * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
+ * compressed ARC functionality is disabled, the b_pdata will point to an
+ * uncompressed version of the on-disk data.
+ *
+ * When a consumer reads a block, the ARC must first look to see if the
+ * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t,
+ * then an additional arc_buf_t is allocated and the uncompressed data is
+ * bcopied from the existing arc_buf_t. If the hdr is cached but does not
+ * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses
+ * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's
+ * b_pdata is not compressed, then the block is shared with the newly
+ * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t
+ * in the arc buffer chain. Sharing the block reduces the memory overhead
+ * required when the hdr is caching uncompressed blocks or the compressed
+ * arc functionality has been disabled via 'zfs_compressed_arc_enabled'.
+ *
+ * The diagram below shows an example of an uncompressed ARC hdr that is
+ * sharing its data with an arc_buf_t:
+ *
+ *                arc_buf_hdr_t
+ *                +-----------+
+ *                |           |
+ *                |           |
+ *                |           |
+ *                +-----------+
+ * l2arc_buf_hdr_t|           |
+ *                |           |
+ *                +-----------+
+ * l1arc_buf_hdr_t|           |
+ *                |           |                 arc_buf_t    (shared)
+ *                |    b_buf  +------------>+---------+      arc_buf_t
+ *                |           |             |b_next   +---->+---------+
+ *                |  b_pdata  +-+           |---------|     |b_next   +-->NULL
+ *                +-----------+ |           |         |     +---------+
+ *                              |           |b_data   +-+   |         |
+ *                              |           +---------+ |   |b_data   +-+
+ *                              +->+------+             |   +---------+ |
+ *                                 |      |             |               |
+ *                   uncompressed  |      |             |               |
+ *                        data     +------+             |               |
+ *                                    ^                 +->+------+     |
+ *                                    |       uncompressed |      |     |
+ *                                    |           data     |      |     |
+ *                                    |                    +------+     |
+ *                                    +---------------------------------+
+ *
+ * Writing to the ARC requires that it first discard the b_pdata
+ * since the physical block is about to be rewritten. The new data contents
+ * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline
+ * performs the write, it may compress the data before writing it to disk.
+ * The ARC will be called with the transformed data and will bcopy the
+ * transformed on-disk block into a newly allocated b_pdata.
+ *
+ * When the L2ARC is in use, it will also take advantage of the b_pdata. The
+ * L2ARC will always write the contents of b_pdata to the L2ARC. This means
+ * that when compressed arc is enabled, the L2ARC blocks are identical
+ * to the on-disk blocks in the main data pool. This provides a significant
+ * advantage since the ARC can leverage the bp's checksum when reading from the
+ * L2ARC to determine if the contents are valid. However, if the compressed
+ * arc is disabled, then the L2ARC's block must be transformed to look
+ * like the physical block in the main data pool before comparing the
+ * checksum and determining its validity.
+ */
+
 #include <sys/spa.h>
 #include <sys/zio.h>
+#include <sys/spa_impl.h>
 #include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/refcount.h>
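
A gloss on the "sharing" described in the block comment above: sharing
simply means that the one uncompressed arc_buf_t in the chain points at
the hdr's own b_pdata rather than at a private copy, so the test is
essentially pointer equality. A minimal sketch with hypothetical names
(the real check is arc_buf_is_shared(), further down in this diff):

	/*
	 * A buffer "shares" the hdr's data block when its data pointer
	 * aliases the hdr's physical data pointer; at most one buffer
	 * per chain may do so.
	 */
	static inline int
	toy_is_shared(const void *buf_data, const void *hdr_pdata)
	{
		return (buf_data != NULL && buf_data == hdr_pdata);
	}
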
@@ -151,10 +276,6 @@ static kcondvar_t	arc_reclaim_thread_cv;
 static boolean_t	arc_reclaim_thread_exit;
 static kcondvar_t	arc_reclaim_waiters_cv;
 
-static kmutex_t		arc_user_evicts_lock;
-static kcondvar_t	arc_user_evicts_cv;
-static boolean_t	arc_user_evicts_thread_exit;
-
 uint_t arc_reduce_dnlc_percent = 3;
 
 /*
@@ -230,9 +351,10 @@ uint64_t zfs_arc_meta_min = 0;
 int zfs_arc_grow_retry = 0;
 int zfs_arc_shrink_shift = 0;
 int zfs_arc_p_min_shift = 0;
-int zfs_disable_dup_eviction = 0;
 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
 
+boolean_t zfs_compressed_arc_enabled = B_TRUE;
+
 /*
  * Note that buffers can be in one of 6 states:
  *	ARC_anon	- anonymous (discussed below)
@@ -273,7 +395,7 @@ typedef struct arc_state {
 	/*
 	 * total amount of evictable data in this state
 	 */
-	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
+	refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
 	/*
 	 * total amount of data in this state; this includes: evictable,
 	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
@@ -338,6 +460,26 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_c_max;
 	kstat_named_t arcstat_size;
 	/*
+	 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata.
+	 * Note that the compressed bytes may match the uncompressed bytes
+	 * if the block is either not compressed or compressed arc is disabled.
+	 */
+	kstat_named_t arcstat_compressed_size;
+	/*
+	 * Uncompressed size of the data stored in b_pdata. If compressed
+	 * arc is disabled then this value will be identical to the stat
+	 * above.
+	 */
+	kstat_named_t arcstat_uncompressed_size;
+	/*
+	 * Number of bytes stored in all the arc_buf_t's. This is classified
+	 * as "overhead" since this data is typically short-lived and will
+	 * be evicted from the arc when it becomes unreferenced unless the
+	 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
+	 * values have been set (see comment in dbuf.c for more information).
+	 */
+	kstat_named_t arcstat_overhead_size;
+	/*
 	 * Number of bytes consumed by internal ARC structures necessary
 	 * for tracking purposes; these structures are not actually
 	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
@@ -482,20 +624,13 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_l2_evict_reading;
 	kstat_named_t arcstat_l2_evict_l1cached;
 	kstat_named_t arcstat_l2_free_on_write;
-	kstat_named_t arcstat_l2_cdata_free_on_write;
 	kstat_named_t arcstat_l2_abort_lowmem;
 	kstat_named_t arcstat_l2_cksum_bad;
 	kstat_named_t arcstat_l2_io_error;
 	kstat_named_t arcstat_l2_size;
 	kstat_named_t arcstat_l2_asize;
 	kstat_named_t arcstat_l2_hdr_size;
-	kstat_named_t arcstat_l2_compress_successes;
-	kstat_named_t arcstat_l2_compress_zeros;
-	kstat_named_t arcstat_l2_compress_failures;
 	kstat_named_t arcstat_memory_throttle_count;
-	kstat_named_t arcstat_duplicate_buffers;
-	kstat_named_t arcstat_duplicate_buffers_size;
-	kstat_named_t arcstat_duplicate_reads;
 	kstat_named_t arcstat_meta_used;
 	kstat_named_t arcstat_meta_limit;
 	kstat_named_t arcstat_meta_max;
@@ -537,6 +672,9 @@ static arc_stats_t arc_stats = {
 	{ "c_min",			KSTAT_DATA_UINT64 },
 	{ "c_max",			KSTAT_DATA_UINT64 },
 	{ "size",			KSTAT_DATA_UINT64 },
+	{ "compressed_size",		KSTAT_DATA_UINT64 },
+	{ "uncompressed_size",		KSTAT_DATA_UINT64 },
+	{ "overhead_size",		KSTAT_DATA_UINT64 },
 	{ "hdr_size",			KSTAT_DATA_UINT64 },
 	{ "data_size",			KSTAT_DATA_UINT64 },
 	{ "metadata_size",		KSTAT_DATA_UINT64 },
@@ -570,20 +708,13 @@ static arc_stats_t arc_stats = {
 	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
 	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
 	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
-	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
 	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
 	{ "l2_size",			KSTAT_DATA_UINT64 },
 	{ "l2_asize",			KSTAT_DATA_UINT64 },
 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
-	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
-	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
-	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
-	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
-	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
-	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
 	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
@@ -656,8 +787,12 @@ static arc_state_t	*arc_l2c_only;
 #define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
 #define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
 
-#define	L2ARC_IS_VALID_COMPRESS(_c_) \
-	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
+/* compressed size of entire arc */
+#define	arc_compressed_size	ARCSTAT(arcstat_compressed_size)
+/* uncompressed size of entire arc */
+#define	arc_uncompressed_size	ARCSTAT(arcstat_uncompressed_size)
+/* number of bytes in the arc from arc_buf_t's */
+#define	arc_overhead_size	ARCSTAT(arcstat_overhead_size)
 
 static int		arc_no_grow;	/* Don't try to grow cache size */
 static uint64_t		arc_tempreserve;
@@ -717,6 +852,7 @@ struct arc_write_callback {
  */
 typedef struct l1arc_buf_hdr {
 	kmutex_t		b_freeze_lock;
+	zio_cksum_t		*b_freeze_cksum;
 #ifdef ZFS_DEBUG
 	/*
 	 * used for debugging with kmem_flags - by allocating and freeing
@@ -727,9 +863,10 @@ typedef struct l1arc_buf_hdr {
 #endif
 
 	arc_buf_t		*b_buf;
-	uint32_t		b_datacnt;
+	uint32_t		b_bufcnt;
 	/* for waiting on writes to complete */
 	kcondvar_t		b_cv;
+	uint8_t			b_byteswap;
 
 	/* protected by arc state mutex */
 	arc_state_t		*b_state;
@@ -742,8 +879,7 @@ typedef struct l1arc_buf_hdr {
 	refcount_t		b_refcnt;
 
 	arc_callback_t		*b_acb;
-	/* temporary buffer holder for in-flight compressed data */
-	void			*b_tmp_cdata;
+	void			*b_pdata;
 } l1arc_buf_hdr_t;
 
 typedef struct l2arc_dev l2arc_dev_t;
@@ -752,9 +888,6 @@ typedef struct l2arc_buf_hdr {
 	/* protected by arc_buf_hdr mutex */
 	l2arc_dev_t		*b_dev;		/* L2ARC device */
 	uint64_t		b_daddr;	/* disk address, offset byte */
-	/* real alloc'd buffer size depending on b_compress applied */
-	int32_t			b_asize;
-	uint8_t			b_compress;
 
 	list_node_t		b_l2node;
 } l2arc_buf_hdr_t;
@@ -763,20 +896,37 @@ struct arc_buf_hdr {
 	/* protected by hash lock */
 	dva_t			b_dva;
 	uint64_t		b_birth;
-	/*
-	 * Even though this checksum is only set/verified when a buffer is in
-	 * the L1 cache, it needs to be in the set of common fields because it
-	 * must be preserved from the time before a buffer is written out to
-	 * L2ARC until after it is read back in.
-	 */
-	zio_cksum_t		*b_freeze_cksum;
 
+	arc_buf_contents_t	b_type;
 	arc_buf_hdr_t		*b_hash_next;
 	arc_flags_t		b_flags;
 
-	/* immutable */
-	int32_t			b_size;
-	uint64_t		b_spa;
+	/*
+	 * This field stores the size of the data buffer after
+	 * compression, and is set in the arc's zio completion handlers.
+	 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
+	 *
+	 * While the block pointers can store up to 32MB in their psize
+	 * field, we can only store up to 32MB minus 512B. This is due
+	 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
+	 * a field of zeros represents 512B in the bp). We can't use a
+	 * bias of 1 since we need to reserve a psize of zero, here, to
+	 * represent holes and embedded blocks.
+	 *
+	 * This isn't a problem in practice, since the maximum size of a
+	 * buffer is limited to 16MB, so we never need to store 32MB in
+	 * this field; the upstream illumos code base has the same 16MB
+	 * limit.
+	 */
+	uint16_t		b_psize;
+
+	/*
+	 * This field stores the size of the data buffer before
+	 * compression, and cannot change once set. It is in units
+	 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
+	 */
+	uint16_t		b_lsize;	/* immutable */
+	uint64_t		b_spa;		/* immutable */
 
 	/* L2ARC fields. Undefined when not in L2ARC. */
 	l2arc_buf_hdr_t		b_l2hdr;
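
The units convention for b_psize/b_lsize above is easy to illustrate.
A minimal sketch, assuming SPA_MINBLOCKSHIFT is 9 (512-byte units, per
sys/spa.h); the names below are hypothetical stand-ins for the
HDR_GET_PSIZE()/HDR_GET_LSIZE() macros used elsewhere in this diff:

	#define	TOY_MINBLOCKSHIFT	9	/* log2(SPA_MINBLOCKSIZE) */

	/*
	 * A 16-bit field in 512B units covers the 16MB maximum buffer
	 * with room to spare (16MB >> 9 == 32768, within uint16_t).
	 */
	static inline unsigned long long
	toy_units_to_bytes(unsigned short units)
	{
		return ((unsigned long long)units << TOY_MINBLOCKSHIFT);
	}

	static inline unsigned short
	toy_bytes_to_units(unsigned long long bytes)
	{
		return ((unsigned short)(bytes >> TOY_MINBLOCKSHIFT));
	}

	/*
	 * e.g. a 128K block: toy_bytes_to_units(131072) == 256 and
	 * toy_units_to_bytes(256) == 131072.
	 */
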
@@ -784,9 +934,6 @@ struct arc_buf_hdr {
 	l1arc_buf_hdr_t		b_l1hdr;
 };
 
-static arc_buf_t *arc_eviction_list;
-static arc_buf_hdr_t arc_eviction_hdr;
-
 #define	GHOST_STATE(state)	\
 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
 	(state) == arc_l2c_only)
@@ -795,25 +942,35 @@ static arc_buf_hdr_t arc_eviction_hdr;
 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 #define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
-#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
-#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
+#define	HDR_COMPRESSION_ENABLED(hdr)	\
+	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
 
 #define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
-#define	HDR_L2COMPRESS(hdr)	((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
 #define	HDR_L2_READING(hdr)	\
-	    (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
-	    ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
+	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
+	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
 #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
 #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
 #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
+#define	HDR_SHARED_DATA(hdr)	((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
 
 #define	HDR_ISTYPE_METADATA(hdr)	\
-	    ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
+	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
 #define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))
 
 #define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
 #define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
 
+/* For storing compression mode in b_flags */
+#define	HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)
+
+#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
+	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
+#define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
+	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
+
+#define	ARC_BUF_LAST(buf)	((buf)->b_next == NULL)
+
 /*
  * Other sizes
  */
@@ -866,16 +1023,6 @@ uint64_t zfs_crc64_table[256];
 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
 
-/*
- * Used to distinguish headers that are being process by
- * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
- * address. This can happen when the header is added to the l2arc's list
- * of buffers to write in the first stage of l2arc_write_buffers(), but
- * has not yet been written out which happens in the second stage of
- * l2arc_write_buffers().
- */
-#define	L2ARC_ADDR_UNSET	((uint64_t)(-1))
-
 #define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
 #define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
 
@@ -917,12 +1064,10 @@ static kmutex_t l2arc_free_on_write_mtx;
 static uint64_t l2arc_ndev;			/* number of devices */
 
 typedef struct l2arc_read_callback {
-	arc_buf_t		*l2rcb_buf;		/* read buffer */
-	spa_t			*l2rcb_spa;		/* spa */
+	arc_buf_hdr_t		*l2rcb_hdr;		/* read buffer */
 	blkptr_t		l2rcb_bp;		/* original blkptr */
 	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
 	int			l2rcb_flags;		/* original flags */
-	enum zio_compress	l2rcb_compress;		/* applied compress */
 } l2arc_read_callback_t;
 
 typedef struct l2arc_write_callback {
@@ -934,7 +1079,7 @@ typedef struct l2arc_data_free {
 	/* protected by l2arc_free_on_write_mtx */
 	void		*l2df_data;
 	size_t		l2df_size;
-	void		(*l2df_func)(void *, size_t);
+	arc_buf_contents_t l2df_type;
 	list_node_t	l2df_list_node;
 } l2arc_data_free_t;
 
@@ -942,21 +1087,22 @@ static kmutex_t l2arc_feed_thr_lock;
 static kcondvar_t l2arc_feed_thr_cv;
 static uint8_t l2arc_thread_exit;
 
-static void arc_get_data_buf(arc_buf_t *);
+static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
+static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr);
+static void arc_hdr_alloc_pdata(arc_buf_hdr_t *);
 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
 static boolean_t arc_is_overflowing();
 static void arc_buf_watch(arc_buf_t *);
 
 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
+static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
+static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
 
 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
 static void l2arc_read_done(zio_t *);
 
-static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
-static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
-static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
-
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
@@ -974,14 +1120,14 @@ buf_hash(uint64_t spa, const dva_t *dva,
 	return (crc);
 }
 
-#define	BUF_EMPTY(buf)						\
-	((buf)->b_dva.dva_word[0] == 0 &&			\
-	(buf)->b_dva.dva_word[1] == 0)
-
-#define	BUF_EQUAL(spa, dva, birth, buf)				\
-	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
-	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
-	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
+#define	HDR_EMPTY(hdr)						\
+	((hdr)->b_dva.dva_word[0] == 0 &&			\
+	(hdr)->b_dva.dva_word[1] == 0)
+
+#define	HDR_EQUAL(spa, dva, birth, hdr)				\
+	((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
+	((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
+	((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
 
 static void
 buf_discard_identity(arc_buf_hdr_t *hdr)
@@ -1003,7 +1149,7 @@ buf_hash_find(uint64_t spa, const blkptr
 	mutex_enter(hash_lock);
 	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
 	    hdr = hdr->b_hash_next) {
-		if (BUF_EQUAL(spa, dva, birth, hdr)) {
+		if (HDR_EQUAL(spa, dva, birth, hdr)) {
 			*lockp = hash_lock;
 			return (hdr);
 		}
@@ -1041,13 +1187,13 @@ buf_hash_insert(arc_buf_hdr_t *hdr, kmut
 
 	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
 	    fhdr = fhdr->b_hash_next, i++) {
-		if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
+		if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
 			return (fhdr);
 	}
 
 	hdr->b_hash_next = buf_hash_table.ht_table[idx];
 	buf_hash_table.ht_table[idx] = hdr;
-	hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
+	arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 
 	/* collect some hash table performance data */
 	if (i > 0) {
@@ -1075,12 +1221,12 @@ buf_hash_remove(arc_buf_hdr_t *hdr)
 
 	hdrp = &buf_hash_table.ht_table[idx];
 	while ((fhdr = *hdrp) != hdr) {
-		ASSERT(fhdr != NULL);
+		ASSERT3P(fhdr, !=, NULL);
 		hdrp = &fhdr->b_hash_next;
 	}
 	*hdrp = hdr->b_hash_next;
 	hdr->b_hash_next = NULL;
-	hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
+	arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
 
 	/* collect some hash table performance data */
 	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
@@ -1166,7 +1312,7 @@ hdr_full_dest(void *vbuf, void *unused)
 {
 	arc_buf_hdr_t *hdr = vbuf;
 
-	ASSERT(BUF_EMPTY(hdr));
+	ASSERT(HDR_EMPTY(hdr));
 	cv_destroy(&hdr->b_l1hdr.b_cv);
 	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
 	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
@@ -1180,7 +1326,7 @@ hdr_l2only_dest(void *vbuf, void *unused
 {
 	arc_buf_hdr_t *hdr = vbuf;
 
-	ASSERT(BUF_EMPTY(hdr));
+	ASSERT(HDR_EMPTY(hdr));
 	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
 }
 
@@ -1253,166 +1399,138 @@ retry:
 	}
 }
 
-/*
- * Transition between the two allocation states for the arc_buf_hdr struct.
- * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
- * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
- * version is used when a cache buffer is only in the L2ARC in order to reduce
- * memory usage.
- */
-static arc_buf_hdr_t *
-arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
-{
-	ASSERT(HDR_HAS_L2HDR(hdr));
-
-	arc_buf_hdr_t *nhdr;
-	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
-
-	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
-	    (old == hdr_l2only_cache && new == hdr_full_cache));
-
-	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
-
-	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
-	buf_hash_remove(hdr);
-
-	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
-
-	if (new == hdr_full_cache) {
-		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
-		/*
-		 * arc_access and arc_change_state need to be aware that a
-		 * header has just come out of L2ARC, so we set its state to
-		 * l2c_only even though it's about to change.
-		 */
-		nhdr->b_l1hdr.b_state = arc_l2c_only;
-
-		/* Verify previous threads set to NULL before freeing */
-		ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
-	} else {
-		ASSERT(hdr->b_l1hdr.b_buf == NULL);
-		ASSERT0(hdr->b_l1hdr.b_datacnt);
-
-		/*
-		 * If we've reached here, We must have been called from
-		 * arc_evict_hdr(), as such we should have already been
-		 * removed from any ghost list we were previously on
-		 * (which protects us from racing with arc_evict_state),
-		 * thus no locking is needed during this check.
-		 */
-		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
-
-		/*
-		 * A buffer must not be moved into the arc_l2c_only
-		 * state if it's not finished being written out to the
-		 * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
-		 * might try to be accessed, even though it was removed.
-		 */
-		VERIFY(!HDR_L2_WRITING(hdr));
-		VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
+#define	ARC_MINTIME	(hz>>4) /* 62 ms */
 
-#ifdef ZFS_DEBUG
-		if (hdr->b_l1hdr.b_thawed != NULL) {
-			kmem_free(hdr->b_l1hdr.b_thawed, 1);
-			hdr->b_l1hdr.b_thawed = NULL;
-		}
-#endif
+static inline boolean_t
+arc_buf_is_shared(arc_buf_t *buf)
+{
+	boolean_t shared = (buf->b_data != NULL &&
+	    buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
+	IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
+	return (shared);
+}
 
-		nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
+static inline void
+arc_cksum_free(arc_buf_hdr_t *hdr)
+{
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+	if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+		kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
+		hdr->b_l1hdr.b_freeze_cksum = NULL;
 	}
-	/*
-	 * The header has been reallocated so we need to re-insert it into any
-	 * lists it was on.
-	 */
-	(void) buf_hash_insert(nhdr, NULL);
-
-	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
-
-	mutex_enter(&dev->l2ad_mtx);
-
-	/*
-	 * We must place the realloc'ed header back into the list at
-	 * the same spot. Otherwise, if it's placed earlier in the list,
-	 * l2arc_write_buffers() could find it during the function's
-	 * write phase, and try to write it out to the l2arc.
-	 */
-	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
-	list_remove(&dev->l2ad_buflist, hdr);
-
-	mutex_exit(&dev->l2ad_mtx);
-
-	/*
-	 * Since we're using the pointer address as the tag when
-	 * incrementing and decrementing the l2ad_alloc refcount, we
-	 * must remove the old pointer (that we're about to destroy) and
-	 * add the new pointer to the refcount. Otherwise we'd remove
-	 * the wrong pointer address when calling arc_hdr_destroy() later.
-	 */
-
-	(void) refcount_remove_many(&dev->l2ad_alloc,
-	    hdr->b_l2hdr.b_asize, hdr);
-
-	(void) refcount_add_many(&dev->l2ad_alloc,
-	    nhdr->b_l2hdr.b_asize, nhdr);
-
-	buf_discard_identity(hdr);
-	hdr->b_freeze_cksum = NULL;
-	kmem_cache_free(old, hdr);
-
-	return (nhdr);
+	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 }
 
-
-#define	ARC_MINTIME	(hz>>4) /* 62 ms */
-
 static void
 arc_cksum_verify(arc_buf_t *buf)
 {
+	arc_buf_hdr_t *hdr = buf->b_hdr;
 	zio_cksum_t zc;
 
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
-	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
-	if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
-		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+
+	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+	if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
+		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 		return;
 	}
-	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
-	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
+	fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc);
+	if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
 		panic("buffer modified while frozen!");
-	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 }
 
-static int
-arc_cksum_equal(arc_buf_t *buf)
+static boolean_t
+arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
 {
-	zio_cksum_t zc;
-	int equal;
+	enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
+	boolean_t valid_cksum;
 
-	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
-	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
-	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
-	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+	ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
+	VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
+
+	/*
+	 * We rely on the blkptr's checksum to determine if the block
+	 * is valid or not. When compressed arc is enabled, the l2arc
+	 * writes the block to the l2arc just as it appears in the pool.
+	 * This allows us to use the blkptr's checksum to validate the
+	 * data that we just read off of the l2arc without having to store
+	 * a separate checksum in the arc_buf_hdr_t. However, if compressed
+	 * arc is disabled, then the data written to the l2arc is always
+	 * uncompressed and won't match the block as it exists in the main
+	 * pool. When this is the case, we must first compress it if it is
+	 * compressed on the main pool before we can validate the checksum.
+	 */
+	if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
+		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+		uint64_t lsize = HDR_GET_LSIZE(hdr);
+		uint64_t csize;
+
+		void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr));
+		csize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+		ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
+		if (csize < HDR_GET_PSIZE(hdr)) {
+			/*
+			 * Compressed blocks are always a multiple of the
+			 * smallest ashift in the pool. Ideally, we would
+			 * like to round up the csize to the next
+			 * spa_min_ashift but that value may have changed
+			 * since the block was last written. Instead,
+			 * we rely on the fact that the hdr's psize
+			 * was set to the psize of the block when it was
+			 * last written. We set the csize to that value
+			 * and zero out any part that should not contain
+			 * data.
+			 */
+			bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize);
+			csize = HDR_GET_PSIZE(hdr);
+		}
+		zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL);
+	}
 
-	return (equal);
+	/*
+	 * Block pointers always store the checksum for the logical data.
+	 * If the block pointer has the gang bit set, then the checksum
+	 * it represents is for the reconstituted data and not for an
+	 * individual gang member. The zio pipeline, however, must be able to
+	 * determine the checksum of each of the gang constituents so it
+	 * treats the checksum comparison differently than what we need
+	 * for l2arc blocks. This prevents us from using the
+	 * zio_checksum_error() interface directly. Instead we must call the
+	 * zio_checksum_error_impl() so that we can ensure the checksum is
+	 * generated using the correct checksum algorithm and accounts for the
+	 * logical I/O size and not just a gang fragment.
+	 */
+	valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
+	    BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size,
+	    zio->io_offset, NULL) == 0);
+	zio_pop_transforms(zio);
+	return (valid_cksum);
 }
 
 static void
-arc_cksum_compute(arc_buf_t *buf, boolean_t force)
+arc_cksum_compute(arc_buf_t *buf)
 {
-	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+
+	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
+	ASSERT(HDR_HAS_L1HDR(hdr));
 	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
-	if (buf->b_hdr->b_freeze_cksum != NULL) {
-		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+	if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 		return;
 	}
-	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
-	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
-	    NULL, buf->b_hdr->b_freeze_cksum);
-	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+	hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+	    KM_SLEEP);
+	fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL,
+	    hdr->b_l1hdr.b_freeze_cksum);
+	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 	arc_buf_watch(buf);
 }
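
The zero-fill step in arc_cksum_is_equal() above is worth isolating:
compressed blocks occupy ashift-aligned allocations on disk, so a
freshly recompressed buffer must be padded out to the recorded psize
before its checksum can match the block pointer's. A minimal sketch;
pad_to_psize() is a hypothetical name for the bzero() step shown in
the function:

	#include <string.h>

	/* Zero-pad a recompressed buffer from csize up to psize. */
	static void
	pad_to_psize(char *cbuf, size_t csize, size_t psize)
	{
		if (csize < psize)
			memset(cbuf + csize, 0, psize - csize);
	}
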
 
@@ -1451,7 +1569,7 @@ arc_buf_watch(arc_buf_t *buf)
 		procctl_t ctl;
 		ctl.cmd = PCWATCH;
 		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
-		ctl.prwatch.pr_size = buf->b_hdr->b_size;
+		ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr);
 		ctl.prwatch.pr_wflags = WA_WRITE;
 		result = write(arc_procfd, &ctl, sizeof (ctl));
 		ASSERT3U(result, ==, sizeof (ctl));
@@ -1462,11 +1580,14 @@ arc_buf_watch(arc_buf_t *buf)
 static arc_buf_contents_t
 arc_buf_type(arc_buf_hdr_t *hdr)
 {
+	arc_buf_contents_t type;
 	if (HDR_ISTYPE_METADATA(hdr)) {
-		return (ARC_BUFC_METADATA);
+		type = ARC_BUFC_METADATA;
 	} else {
-		return (ARC_BUFC_DATA);
+		type = ARC_BUFC_DATA;
 	}
+	VERIFY3U(hdr->b_type, ==, type);
+	return (type);
 }
 
 static uint32_t
@@ -1488,29 +1609,29 @@ arc_bufc_to_flags(arc_buf_contents_t typ
 void
 arc_buf_thaw(arc_buf_t *buf)
 {
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+
 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
-		if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
+		if (hdr->b_l1hdr.b_state != arc_anon)
 			panic("modifying non-anon buffer!");
-		if (HDR_IO_IN_PROGRESS(buf->b_hdr))
+		if (HDR_IO_IN_PROGRESS(hdr))
 			panic("modifying buffer while i/o in progress!");
 		arc_cksum_verify(buf);
 	}
 
-	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
-	if (buf->b_hdr->b_freeze_cksum != NULL) {
-		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
-		buf->b_hdr->b_freeze_cksum = NULL;
-	}
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	arc_cksum_free(hdr);
 
+	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
 #ifdef ZFS_DEBUG
 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
-		if (buf->b_hdr->b_l1hdr.b_thawed != NULL)
-			kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1);
-		buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
+		if (hdr->b_l1hdr.b_thawed != NULL)
+			kmem_free(hdr->b_l1hdr.b_thawed, 1);
+		hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
 	}
 #endif
 
-	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
 
 	arc_buf_unwatch(buf);
 }
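
arc_buf_thaw() above and arc_buf_freeze() in the next hunk form a
debug-only pair, active when ZFS_DEBUG_MODIFY is set in zfs_flags. A
hedged sketch of the intended calling pattern from a consumer that owns
an anonymous buffer; fill_buffer() is a hypothetical stand-in for the
consumer's write:

	/*
	 * Bracket modifications so later reads can detect stray writes:
	 * thaw drops the stale freeze checksum, freeze recomputes it
	 * (fletcher-2 over the buffer's logical size).
	 */
	arc_buf_thaw(buf);		/* about to modify */
	fill_buffer(buf->b_data);	/* hypothetical consumer write */
	arc_buf_freeze(buf);		/* contents now stable */
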
@@ -1518,53 +1639,246 @@ arc_buf_thaw(arc_buf_t *buf)
 void
 arc_buf_freeze(arc_buf_t *buf)
 {
+	arc_buf_hdr_t *hdr = buf->b_hdr;
 	kmutex_t *hash_lock;
 
 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
 		return;
 
-	hash_lock = HDR_LOCK(buf->b_hdr);
+	hash_lock = HDR_LOCK(hdr);
 	mutex_enter(hash_lock);
 
-	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
-	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
-	arc_cksum_compute(buf, B_FALSE);
+	ASSERT(HDR_HAS_L1HDR(hdr));
+	ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL ||
+	    hdr->b_l1hdr.b_state == arc_anon);
+	arc_cksum_compute(buf);
 	mutex_exit(hash_lock);
 
 }
 
+/*
+ * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
+ * the following functions should be used to ensure that the flags are
+ * updated in a thread-safe way. When manipulating the flags either
+ * the hash_lock must be held or the hdr must be undiscoverable. This
+ * ensures that we're not racing with any other threads when updating
+ * the flags.
+ */
+static inline void
+arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+	hdr->b_flags |= flags;
+}
+
+static inline void
+arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+	hdr->b_flags &= ~flags;
+}
+
+/*
+ * Setting the compression bits in the arc_buf_hdr_t's b_flags is
+ * done in a special way since we have to clear and set bits
+ * at the same time. Consumers that wish to set the compression bits
+ * must use this function to ensure that the flags are updated in
+ * a thread-safe manner.
+ */
+static void
+arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
+{
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+	/*
+	 * Holes and embedded blocks will always have a psize = 0 so
+	 * we ignore the compression of the blkptr and set the
+	 * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
+	 * Holes and embedded blocks remain anonymous so we don't
+	 * want to uncompress them. Mark them as uncompressed.
+	 */
+	if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
+		arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+		HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
+		ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
+		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+	} else {
+		arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+		HDR_SET_COMPRESS(hdr, cmp);
+		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
+		ASSERT(HDR_COMPRESSION_ENABLED(hdr));
+	}
+}
+
+static int
+arc_decompress(arc_buf_t *buf)
+{
+	arc_buf_hdr_t *hdr = buf->b_hdr;
+	dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
+	int error;
+
+	if (arc_buf_is_shared(buf)) {
+		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+	} else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
+		/*
+		 * The arc_buf_hdr_t is either not compressed or is
+		 * associated with an embedded block or a hole, in which
+		 * case it remains anonymous.
+		 */
+		IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 ||
+		    HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr));
+		ASSERT(!HDR_SHARED_DATA(hdr));
+		bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr));
+	} else {
+		ASSERT(!HDR_SHARED_DATA(hdr));
+		ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
+		error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+		    hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr),
+		    HDR_GET_LSIZE(hdr));
+		if (error != 0) {
+			zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d",
+			    hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr),
+			    HDR_GET_LSIZE(hdr));
+			return (SET_ERROR(EIO));
+		}
+	}
+	if (bswap != DMU_BSWAP_NUMFUNCS) {

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


